add vxpu
60  .github_bak/ISSUE_TEMPLATE/001_feature_request.yaml  Normal file
@@ -0,0 +1,60 @@
name: Feature Request
description: Propose a new feature or enhancement
labels: ["feature request"]
title: "feature: <short summary>"
body:
  - type: markdown
    attributes:
      value: |
        👋 **Thanks for taking the time to submit a feature request!**

        Please help us understand your idea clearly by filling out the sections below.
        Well-described proposals are easier to evaluate and more likely to be prioritized.

  - type: textarea
    attributes:
      label: Feature Description
      description: |
        Describe the feature or enhancement you are proposing.
        What should it do? What problem does it solve?
        Please be as clear and specific as possible.
      placeholder: |
        Example:
        - Add support for XXX
        - Improve performance of YYY under ZZZ workload
    validations:
      required: true

  - type: textarea
    attributes:
      label: Motivation / Use Case
      description: |
        Why do you need this feature?
        Describe the real-world use case, workload, or scenario where this feature is important.
      placeholder: |
        Example:
        - This feature is required to support large-scale inference
        - Needed to reduce TTFT/TPOT under high concurrency
    validations:
      required: false

  - type: textarea
    attributes:
      label: Additional Context
      description: |
        Add any additional context that may help us evaluate this request.
        This can include design ideas, references, benchmarks, logs, or related issues/PRs.
      placeholder: |
        - Related issues or PRs
        - Links to papers or docs
        - Performance numbers or expected impact
    validations:
      required: false

  - type: markdown
    attributes:
      value: |
        👍 **Community feedback matters!**

        If you find this proposal useful, feel free to give it a 👍.
        We often prioritize feature requests with strong community interest.

74  .github_bak/ISSUE_TEMPLATE/002_bug_report.yaml  Normal file
@@ -0,0 +1,74 @@
name: Bug Report
description: Report a bug or unexpected behavior
labels: ["bug"]
title: "bug: <short summary>"
body:
  - type: markdown
    attributes:
      value: |
        🐞 **Thanks for reporting a bug!**

        To help us investigate and fix the issue efficiently, please provide as much
        relevant information as possible. Clear and reproducible reports are highly appreciated.

  - type: textarea
    attributes:
      label: Bug Description
      description: |
        Clearly and concisely describe the bug.
        What happened? What is broken or behaving incorrectly?
      placeholder: |
        Example:
        - vLLM crashes when loading model XXX
        - Unexpected latency spike during decode stage
    validations:
      required: true

  - type: textarea
    attributes:
      label: Steps to Reproduce
      description: |
        Provide the exact steps to reproduce the issue.
        Please include commands, configuration, and a minimal repro if possible.
      placeholder: |
        Example:
        1. Start vLLM with config XXX
        2. Send request YYY
        3. Observe the error or incorrect behavior
    validations:
      required: true

  - type: textarea
    attributes:
      label: Expected Behavior
      description: |
        Describe what you expected to happen instead.
        This helps clarify whether the behavior is incorrect or just unexpected.
      placeholder: |
        Example:
        - Model should load successfully
        - Latency should remain stable under N requests
    validations:
      required: false

  - type: textarea
    attributes:
      label: Additional Context
      description: |
        Add any additional information that may help diagnose the issue.
        This can include logs, stack traces, environment details, or related issues.
      placeholder: |
        - Logs / stack traces
        - OS, CUDA, driver, hardware info
        - vLLM / Kunlun version
        - Related issues or PRs
    validations:
      required: false

  - type: markdown
    attributes:
      value: |
        👍 **Does this bug affect you as well?**

        Please consider giving it a 👍.
        We often prioritize issues that impact a larger portion of the community.

8  .github_bak/ISSUE_TEMPLATE/config.yml  Normal file
@@ -0,0 +1,8 @@
blank_issues_enabled: true

contact_links:
  - name: 📘 Documentation & Tutorials
    url: https://vllm-kunlun.readthedocs.io
    about: |
      Please check the official documentation and tutorials first.
      Many common questions, usage examples, and best practices are covered there.

85  .github_bak/PULL_REQUEST_TEMPLATE.md  Normal file
@@ -0,0 +1,85 @@
## PR Description

<!--
Please provide a clear and concise description of this PR:
- What problem does it solve?
- Why is this change needed?
- What is the overall approach?
-->

FIX #xxxx
<!-- Link the existing issue(s) this PR resolves, e.g.:
FIX #1234
-->

---

## Checklist (Required)

Before submitting this PR, please ensure that all of the following items are completed:

- [ ] All code changes pass the [`pre-commit`](https://github.com/baidu/vLLM-Kunlun/blob/main/CONTRIBUTING.md) checks.
- [ ] Commits are signed off using `git commit -s`.
- [ ] The PR title is properly classified (see below).

---

## PR Type

Please prefix the PR title with one or more of the following labels to help reviewers quickly understand the nature of the change:

- `[Feature]` – New features or enhancements (e.g. Attention, Communicator, Kernel, Worker, etc.)
- `[Bugfix]` – Bug fixes
- `[CI/Build]` – CI, build system, or infrastructure improvements
- `[Doc]` – Documentation updates or fixes
- `[Misc]` – Other changes that do not fit the above categories (use sparingly)

> **Note:** If the PR spans multiple categories, include all relevant prefixes.

---

<details>
<summary><b>Detailed Checklist (Click to Expand)</b></summary>

<p>Thank you for contributing to <b>vLLM Kunlun</b>!
To help us maintain high code quality and streamline the review process, please ensure your PR meets the following requirements.</p>

<h3>1. Code Quality</h3>

<ul>
  <li>All linting and formatting checks pass (<code>pre-commit</code>).</li>
  <li>The code is well structured and sufficiently documented.</li>
  <li>The change is designed with maintainability and readability in mind.</li>
</ul>

<h3>2. Testing</h3>

<ul>
  <li>Relevant unit tests are added or updated.</li>
  <li>Integration tests are included when applicable.</li>
  <li>Existing tests continue to pass.</li>
</ul>

<h3>3. DCO Compliance</h3>

<p>This project follows the
<a href="https://github.com/vllm-project/vllm/blob/main/DCO">Developer Certificate of Origin (DCO)</a>.</p>

<ul>
  <li>All commits include a <code>Signed-off-by:</code> line.</li>
  <li>Use <code>git commit -s</code> to automatically add the sign-off.</li>
</ul>

<h3>4. Review Expectations</h3>

<p>During the review process, maintainers may:</p>

<ul>
  <li>Request code refactoring or additional tests.</li>
  <li>Ask for clarifications on design decisions.</li>
  <li>Suggest performance, stability, or maintainability improvements.</li>
</ul>

<p>We appreciate your patience and collaboration throughout the review process!</p>

</details>

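The DCO sign-off required by the checklist above is just a trailer line that `git commit -s` appends from your git identity. A minimal sketch against a throwaway repository (the name, email, and commit message are illustrative):

```shell
#!/usr/bin/env bash
# Demonstrate the Signed-off-by trailer that `git commit -s` adds.
set -eu

repo=$(mktemp -d)
cd "$repo"
git init -q
git config user.name "Jane Dev"
git config user.email "jane@example.com"

echo "x" > f.txt
git add f.txt
git commit -q -s -m "[Doc] illustrative commit"

# The commit message now ends with a DCO trailer:
trailer=$(git log -1 --pretty=%B | grep '^Signed-off-by:')
echo "$trailer"   # Signed-off-by: Jane Dev <jane@example.com>
```

Running this prints the exact trailer reviewers look for; without it, DCO checks on the PR will fail.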
54  .github_bak/workflows/pylint-check.yml  Normal file
@@ -0,0 +1,54 @@
name: Code Style Check

on:
  pull_request:
  push:
    branches: [ main ]

jobs:
  pylint-check:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install tools
        run: |
          pip install black isort ruff

      - name: Get changed python files
        id: changed
        shell: bash
        run: |
          if [ "${{ github.event_name }}" = "pull_request" ]; then
            files=$(git diff --name-only --diff-filter=ACMRT origin/${{ github.base_ref }}...HEAD)
          else
            files=$(git diff --name-only --diff-filter=ACMRT HEAD~1...HEAD)
          fi

          files=$(echo "$files" | grep '\.py$' || true)

          echo "files<<EOF" >> $GITHUB_OUTPUT
          echo "$files" >> $GITHUB_OUTPUT
          echo "EOF" >> $GITHUB_OUTPUT

      - name: Run ruff
        if: steps.changed.outputs.files != ''
        run: |
          echo "${{ steps.changed.outputs.files }}" | xargs ruff check --fix
        continue-on-error: true

      - name: Run black
        if: steps.changed.outputs.files != ''
        run: |
          echo "${{ steps.changed.outputs.files }}" | xargs black --check

      - name: Run isort
        if: steps.changed.outputs.files != ''
        run: |
          echo "${{ steps.changed.outputs.files }}" | xargs isort --check-only --profile black

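The changed-file detection in the workflow above (diff the last commit, keep only `.py` files) can be exercised locally. A minimal sketch against a throwaway repository; the file names are illustrative:

```shell
#!/usr/bin/env bash
# Reproduce the workflow's non-PR branch: diff HEAD~1...HEAD, keep .py files.
set -eu

repo=$(mktemp -d)
cd "$repo"
git init -q
git -c user.name=ci -c user.email=ci@example.com commit -q --allow-empty -m base

# Simulate a push that adds one python file and one text file
echo "print('hi')" > new_module.py
echo "notes" > notes.txt
git add .
git -c user.name=ci -c user.email=ci@example.com commit -q -m change

# Same logic as the workflow step
files=$(git diff --name-only --diff-filter=ACMRT HEAD~1...HEAD)
files=$(echo "$files" | grep '\.py$' || true)

echo "$files"   # new_module.py
```

Only `new_module.py` survives the filter, so the lint steps run on touched Python files rather than the whole tree.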
161  .github_bak/workflows/run-e2e.yml  Normal file
@@ -0,0 +1,161 @@
# name: e2e-test

# on:
#   workflow_call:
#   pull_request:
#     branches: [main]
#     types: [opened, synchronize, reopened]
#   push:
#     branches: [main]

# concurrency:
#   group: e2e-singlecard
#   cancel-in-progress: false

# jobs:
#   e2e:
#     name: e2e-test-singlecard
#     runs-on:
#       - self-hosted
#       - Linux
#       - X64

#     steps:
#       - name: Checkout PR code
#         uses: actions/checkout@v4
#         with:
#           fetch-depth: 0

#       - name: Verify PR workspace
#         run: |
#           echo "===== WORKSPACE ====="
#           pwd
#           ls -l
#           echo "===== GIT INFO ====="
#           git rev-parse HEAD
#           git log -1 --oneline
#           git status --porcelain

#       - name: Start docker
#         run: |
#           bash ci/scripts/docker/start_docker.sh

#       - name: Install environments
#         env:
#           PROXY_URL: ${{ secrets.PROXY_URL }}
#           NO_PROXY_LIST: ${{ secrets.NO_PROXY_LIST }}
#         run: |
#           bash ci/scripts/env/install_env.sh

#       - name: Start vLLM server
#         run: |
#           bash ci/scripts/server/start_vllm.sh

#       - name: Wait for vLLM ready
#         run: |
#           bash ci/scripts/server/wait_vllm.sh

#       - name: API Test
#         run: |
#           docker exec aiak-e2e-singlecard bash -lc '
#           curl http://localhost:8356/v1/chat/completions \
#             -H "Content-Type: application/json" \
#             -d @- << "EOF"
#           {
#             "model": "Qwen3-8B",
#             "messages": [
#               { "role": "user", "content": "Who are you?" }
#             ],
#             "max_tokens": 200,
#             "temperature": 0
#           }
#           EOF
#           '

#       - name: Accuracy testing
#         run: |
#           bash ci/scripts/tests/run_accuracy.sh

#       - name: Performance testing
#         run: |
#           docker exec aiak-e2e-singlecard bash -lc '
#           source ci/scripts/common/env.sh
#           source ci/scripts/common/log.sh

#           # ==========================================
#           # 1. Define test dimensions
#           #    (can be easily extended, e.g., add "2048x2048")
#           # ==========================================
#           DIMENSIONS=("1024x1024")

#           # ==========================================
#           # 2. Define concurrency generation logic (densification strategy)
#           # ==========================================
#           # Use array concatenation to combine different density ranges
#           # Syntax: seq [start] [step] [end]
#           CONCURRENCIES=(1)

#           # ==========================================
#           # 3. Automatically assemble test cases
#           # ==========================================
#           TEST_COMBINATIONS=()  # Initialize empty array

#           # Outer loop over batch size (concurrency), inner loop over dimensions
#           for bs in "${CONCURRENCIES[@]}"; do
#             for dim in "${DIMENSIONS[@]}"; do
#               case_str="${bs}x${dim}"
#               TEST_COMBINATIONS+=("$case_str")
#             done
#           done

#           # ==========================================
#           # 4. (Optional) Print generated cases for a sanity check
#           # ==========================================
#           echo "Generated ${#TEST_COMBINATIONS[@]} test cases in total:"
#           echo "${TEST_COMBINATIONS[@]}"

#           # Progress counters
#           TOTAL_TESTS=${#TEST_COMBINATIONS[@]}
#           CURRENT_TEST=0

#           # Iterate over all test combinations
#           for COMBINATION in "${TEST_COMBINATIONS[@]}"; do
#             # Parse parameters from the combination string
#             NUM_PROMPTS=$(echo $COMBINATION | cut -dx -f1)
#             INPUT_LEN=$(echo $COMBINATION | cut -dx -f2)
#             OUTPUT_LEN=$(echo $COMBINATION | cut -dx -f3)

#             # Update progress
#             CURRENT_TEST=$((CURRENT_TEST + 1))

#             echo "=========================================================="
#             echo "Test progress: $CURRENT_TEST / $TOTAL_TESTS"
#             echo "Current configuration: concurrency=$NUM_PROMPTS, input_len=$INPUT_LEN, output_len=$OUTPUT_LEN"
#             echo "=========================================================="

#             # OUTPUT_FILE="$RESULT_DIR/p800_${NUM_PROMPTS}_${INPUT_LEN}_${OUTPUT_LEN}.log"

#             # Run benchmark
#             python3 -m vllm.entrypoints.cli.main bench serve \
#               --host 127.0.0.1 \
#               --port ${VLLM_PORT:-8356} \
#               --backend vllm \
#               --model ${SERVED_MODEL_NAME:-Qwen3-8B} \
#               --dataset-name random \
#               --num-prompts $NUM_PROMPTS \
#               --random-input-len $INPUT_LEN \
#               --random-output-len $OUTPUT_LEN \
#               --tokenizer ${MODEL_PATH:-/ssd3/models/Qwen3-8B} \
#               --ignore-eos
#           done
#           '

      - name: Set permissions
        if: always()
        run: |
          bash ci/scripts/docker/set_permissions.sh

      - name: Cleanup docker
        if: always()
        run: |
          bash ci/scripts/docker/stop_docker.sh

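The commented-out performance step above assembles test cases as `<concurrency>x<input_len>x<output_len>` strings and parses them back with `cut`. The matrix logic can be sketched standalone; the extra `2048x2048` dimension and concurrency `4` below are illustrative additions, not values from the workflow:

```shell
#!/usr/bin/env bash
# Build the benchmark test matrix and parse each combination, as the
# Performance testing step does.
set -eu

DIMENSIONS=("1024x1024" "2048x2048")  # "<input_len>x<output_len>"
CONCURRENCIES=(1 4)

# Cross product: outer loop over concurrency, inner loop over dimensions
TEST_COMBINATIONS=()
for bs in "${CONCURRENCIES[@]}"; do
  for dim in "${DIMENSIONS[@]}"; do
    TEST_COMBINATIONS+=("${bs}x${dim}")
  done
done

# Parse each "<bs>x<in>x<out>" string back into its three fields
for COMBINATION in "${TEST_COMBINATIONS[@]}"; do
  NUM_PROMPTS=$(echo "$COMBINATION" | cut -dx -f1)
  INPUT_LEN=$(echo "$COMBINATION" | cut -dx -f2)
  OUTPUT_LEN=$(echo "$COMBINATION" | cut -dx -f3)
  echo "concurrency=$NUM_PROMPTS input=$INPUT_LEN output=$OUTPUT_LEN"
done
```

With two dimensions and two concurrency levels this yields four cases; each parsed triple would feed one `bench serve` invocation.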
53  .github_bak/workflows/ut.yml  Normal file
@@ -0,0 +1,53 @@
# name: Unit Test

# on:
#   pull_request:
#     branches:
#       - main

# jobs:
#   test-kunlun:
#     runs-on:
#       labels:
#         - self-hosted
#         - Linux
#         - X64
#         - test-1  # Actions Runner label

#     steps:
#       - name: Checkout Code
#         uses: actions/checkout@v4

#       - name: Install vLLM-Kunlun Dependencies
#         run: |
#           pip install -r requirements.txt

#           python setup.py build
#           python setup.py develop

#           # Install the KL3-customized build of PyTorch
#           wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run "https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xpytorch-cp310-torch251-ubuntu2004-x64.run?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-02T05%3A01%3A27Z%2F-1%2Fhost%2Ff3cf499234f82303891aed2bcb0628918e379a21e841a3fac6bd94afef491ff7"
#           bash xpytorch-cp310-torch251-ubuntu2004-x64.run

#           # Install custom ops
#           pip install "https://baidu-kunlun-public.su.bcebos.com/v1/baidu-kunlun-share/1130/xtorch_ops-0.1.2209%2B6752ad20-cp310-cp310-linux_x86_64.whl?authorization=bce-auth-v1%2FALTAKypXxBzU7gg4Mk4K4c6OYR%2F2025-12-05T06%3A18%3A00Z%2F-1%2Fhost%2F14936c2b7e7c557c1400e4c467c79f7a9217374a7aa4a046711ac4d948f460cd"

#           # Install the KLX3 custom Triton build
#           pip install "https://cce-ai-models.bj.bcebos.com/v1/vllm-kunlun-0.11.0/triton-3.0.0%2Bb2cde523-cp310-cp310-linux_x86_64.whl"

#           # Install the AIAK custom ops library
#           pip install "https://cce-ai-models.bj.bcebos.com/XSpeedGate-whl/release_merge/20251219_152418/xspeedgate_ops-0.0.0-cp310-cp310-linux_x86_64.whl"

#       - name: Install vLLM
#         run: |
#           pip install vllm==0.11.0 --no-build-isolation --no-deps --index-url https://pip.baidu-int.com/simple/

#       - name: Run Unit Test
#         run: |
#           echo "Running full suite..."
#           export XPU_VISIBLE_DEVICES=1
#           pytest \
#             -vs \
#             --cov=vllm_kunlun \
#             --cov-report=term-missing \
#             -p no:warnings tests/ut