# sglang/.github/workflows/pr-test-pd-router.yml

name: Test Disaggregation Mode

# Triggers: pushes to main and pull requests targeting main, but only when the
# disaggregation sources, the server start-up script, or the router change.
# The workflow can also be started manually from the Actions tab.
on:
  push:
    branches:
      - main
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  pull_request:
    branches:
      - main
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  workflow_dispatch:

# One run per ref: a newer run for the same ref cancels the one in progress.
concurrency:
  cancel-in-progress: true
  group: 'test-disaggregation-${{ github.ref }}'

# Scopes granted to GITHUB_TOKEN for every job in this workflow.
# NOTE(review): no step in this file appears to write to issues or pull
# requests - confirm whether the two write scopes are still required.
permissions:
  contents: read
  issues: write
  pull-requests: write

jobs:
  # End-to-end prefill/decode disaggregation test driven through sgl-router.
  test-disaggregation:
    # Runs for every trigger in the upstream repository; in any other
    # repository it runs for pull_request events only.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    # Self-hosted runner label.
    runs-on:
      - h200
    timeout-minutes: 45

    steps:
    # Shallow checkout: only the 10 most recent commits are fetched.
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        fetch-depth: 10

    # setup-python v5 runs on the Node 20 action runtime, matching the other
    # actions used in this workflow (v4 targeted the deprecated Node 16 runtime).
    # The version is quoted so YAML never reads it as a float.
    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    # Installs the Rust toolchain through the repository's CI helper script.
    - name: Setup Rust
      run: bash scripts/ci_install_rust.sh

    # Caches cargo binaries, the registry, git dependencies and the router's
    # build output. The exact key follows sgl-router/Cargo.lock; the
    # restore-keys prefix allows falling back to a cache from another lockfile.
    - name: Cache Rust dependencies
      uses: actions/cache@v4
      with:
        path: |
          ~/.cargo/bin/
          ~/.cargo/registry/index/
          ~/.cargo/registry/cache/
          ~/.cargo/git/db/
          sgl-router/target/
        key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }}
        restore-keys: |
          ${{ runner.os }}-cargo-

    # Caches pip's download cache directory. The exact key follows
    # python/pyproject.toml; the restore-keys prefix allows falling back to a
    # cache created for a different pyproject.toml.
    - name: Cache pip dependencies
      uses: actions/cache@v4
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-pip-${{ hashFiles('python/pyproject.toml') }}
        restore-keys: |
          ${{ runner.os }}-pip-

    - name: Validate environment
      run: |
        # --- GPUs: at least 8 devices must be visible ---
        echo "=== System Validation ==="
        nvidia-smi
        gpu_count=$(nvidia-smi -L | wc -l)
        echo "GPU count: $gpu_count"
        if [[ "$gpu_count" -lt 8 ]]; then
          echo "Error: This test requires at least 8 GPUs"
          exit 1
        fi

        # --- RDMA: the InfiniBand userspace tools must be installed ---
        echo "=== RDMA Validation ==="
        command -v ibv_devices >/dev/null 2>&1 || {
          echo "Error: InfiniBand tools not found"
          exit 1
        }

        # Probe mlx5_0 .. mlx5_11 and stop at the first device whose first
        # reported port state is PORT_ACTIVE.
        active_device=""
        for candidate in mlx5_{0..11}; do
          # Skip device names that do not exist on this host.
          ibv_devinfo "$candidate" >/dev/null 2>&1 || continue

          port_state=$(ibv_devinfo "$candidate" | awk '/state:/ {print $2; exit}')
          if [[ "$port_state" == "PORT_ACTIVE" ]]; then
            echo "✓ Found active device: $candidate"
            active_device="$candidate"
            break
          fi
        done

        if [[ -z "$active_device" ]]; then
          echo "Error: No active IB devices found"
          echo "Available devices:"
          ibv_devices || true
          exit 1
        fi

        # --- Model: the weights must already be present on local disk ---
        echo "=== Model Validation ==="
        model_dir="/raid/models/meta-llama/Llama-3.1-8B-Instruct"
        if [[ ! -d "$model_dir" ]]; then
          echo "Error: Model not found"
          ls -la /raid/models/ || echo "No models directory"
          exit 1
        fi
        echo "✓ Model found"

    - name: Install SGLang dependencies
      run: |
        # The "Cache pip dependencies" step restores ~/.cache/pip, so pip must
        # be allowed to use its cache here; passing --no-cache-dir would make
        # that step save and restore a directory pip never reads or fills.
        echo "Installing SGLang with all extras..."
        python3 -m pip install -e "python[all]" --break-system-packages
        python3 -m pip install mooncake-transfer-engine==0.3.5

    - name: Build and install sgl-router
      run: |
        # Load cargo into PATH for this step's shell.
        source "$HOME/.cargo/env"
        echo "Building sgl-router..."
        cd sgl-router

        # The step shell runs with -e, so each command aborts the step on
        # failure; no && chaining is needed.
        cargo build
        python3 -m build

        # Install through "python3 -m pip" so the wheel lands in the same
        # interpreter that later runs "python3 -m sglang_router.launch_router";
        # a bare "pip" may resolve to a different Python on the runner.
        python3 -m pip install --force-reinstall dist/*.whl

    - name: Start disaggregation servers
      id: start_servers
      run: |
        echo "Starting disaggregation servers..."
        bash scripts/ci_start_disaggregation_servers.sh &
        SERVER_PID=$!
        # Expose the launcher PID so a later step can terminate it.
        echo "server_pid=$SERVER_PID" >> "$GITHUB_OUTPUT"

        # Watch the startup script for up to 60 s (30 polls x 2 s).
        # NOTE(review): if the script is still running when the polls are
        # exhausted, the step proceeds without any health confirmation of its
        # own. Presumably the script performs the health checks itself and
        # then stays alive - confirm against
        # scripts/ci_start_disaggregation_servers.sh.
        wait_count=0
        while [ "$wait_count" -lt 30 ]; do
          if ps -p "$SERVER_PID" > /dev/null; then
            # Script still running: keep polling.
            sleep 2
            wait_count=$((wait_count + 1))
          elif wait "$SERVER_PID"; then
            # Script exited with status 0.
            echo "✓ All disaggregation servers are healthy"
            break
          else
            # "wait" is evaluated as a condition so that, under the step's
            # "bash -e" shell, a non-zero status reaches this branch instead of
            # aborting the script before the error can be reported.
            exit_code=$?
            echo "Error: Server startup failed with code $exit_code"
            exit 1
          fi
        done

        echo "✓ Servers started (PID: $SERVER_PID)"

    - name: Test all policies sequentially
      timeout-minutes: 30
      run: |
        # For each routing policy: start the router in front of the 4 prefill
        # and 4 decode servers, smoke-test the chat completion API (plain and
        # streaming), run a benchmark, enforce performance thresholds, then
        # stop the router before moving on to the next policy.
        POLICIES=("random" "round_robin" "cache_aware" "power_of_two")
        BASE_URL="http://127.0.0.9:8000"
        ROUTER_PID=""

        # Kill the current router whenever the script exits, including aborts
        # triggered by "set -e" (e.g. a failing benchmark command), so that a
        # stale router never keeps the address bound on the runner.
        stop_router() {
          if [ -n "$ROUTER_PID" ]; then
            kill "$ROUTER_PID" 2>/dev/null || true
          fi
        }
        trap stop_router EXIT

        # Succeeds only when $1 is a single plain decimal number.
        is_number() {
          [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]]
        }

        # bc performs the floating-point threshold comparisons below.
        command -v bc >/dev/null || (apt-get update && apt-get install -y bc)

        for policy in "${POLICIES[@]}"; do
          echo ""
          echo "=================================================="
          echo "Testing policy: $policy"
          echo "=================================================="

          # Start router with the current policy
          echo "Starting router with policy: $policy..."
          python3 -m sglang_router.launch_router \
            --pd-disaggregation \
            --policy "$policy" \
            --prefill http://127.0.0.1:30001 9001 \
            --prefill http://127.0.0.2:30002 9002 \
            --prefill http://127.0.0.3:30003 9003 \
            --prefill http://127.0.0.4:30004 9004 \
            --decode http://127.0.0.5:30005 \
            --decode http://127.0.0.6:30006 \
            --decode http://127.0.0.7:30007 \
            --decode http://127.0.0.8:30008 \
            --host 127.0.0.9 \
            --port 8000 &
          ROUTER_PID=$!

          # Wait (up to 60 s, polling every 5 s) for the router to answer HTTP
          echo "Waiting for router to become healthy..."
          TIMEOUT=60
          ELAPSED=0
          while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
            if curl --connect-timeout 5 --silent "$BASE_URL" > /dev/null 2>&1; then
              echo "✓ Router is reachable"
              break
            fi
            if ! ps -p "$ROUTER_PID" > /dev/null; then
              echo "Error: Router process died"
              exit 1
            fi
            sleep 5
            ELAPSED=$((ELAPSED + 5))
          done

          if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
            echo "Error: Router health check timeout"
            exit 1
          fi

          # Test API functionality.
          # "|| true" keeps a curl failure from aborting the script under
          # "set -e" so the check below can report it with context.
          echo "Testing API completions for $policy..."
          response=$(curl -s -X POST "$BASE_URL/v1/chat/completions" \
            -H "Content-Type: application/json" \
            -H "Authorization: Bearer test-token" \
            -d '{
              "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
              "messages": [
                {"role": "user", "content": "Write a Python function to calculate fibonacci numbers recursively"}
              ],
              "stream": false,
              "max_tokens": 100
            }') || true

          if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
            echo "✓ API test passed for $policy"
          else
            echo "✗ API test failed for $policy: $response"
            exit 1
          fi

          # Test streaming (a 30 s timeout or curl error is reported by the
          # check below rather than aborting silently).
          echo "Testing streaming API for $policy..."
          stream_response=$(timeout 30 curl -s -X POST "$BASE_URL/v1/chat/completions" \
            -H "Content-Type: application/json" \
            -H "Authorization: Bearer test-token" \
            -d '{
              "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
              "messages": [
                {"role": "user", "content": "Count from 1 to 5"}
              ],
              "stream": true,
              "max_tokens": 50
            }') || true

          if echo "$stream_response" | grep -q "data:"; then
            echo "✓ Streaming API test passed for $policy"
          else
            echo "✗ Streaming API test failed for $policy"
            exit 1
          fi

          # Run benchmark
          echo "Running benchmark for $policy..."
          benchmark_output=$(python3 -m sglang.bench_one_batch_server \
            --model-path "/raid/models/meta-llama/Llama-3.1-8B-Instruct" \
            --base-url "$BASE_URL" \
            --batch-size 8 \
            --input-len 4096 \
            --output-len 5 \
            --skip-warmup)

          echo "$benchmark_output"

          # Save benchmark output for the artifact upload step
          echo "$benchmark_output" > "benchmark_${policy}.txt"

          # Extract metrics from the benchmark's text output
          latency=$(echo "$benchmark_output" | grep "latency:" | awk '{print $2}' | sed 's/s//')
          input_throughput=$(echo "$benchmark_output" | grep "input throughput:" | awk '{print $3}')
          output_throughput=$(echo "$benchmark_output" | grep "output throughput:" | awk '{print $3}')

          # An empty or multi-line value would make every bc comparison below
          # evaluate as "false" and let the performance gate pass silently, so
          # unparseable metrics are treated as a failure.
          for metric in "$latency" "$input_throughput" "$output_throughput"; do
            if ! is_number "$metric"; then
              echo "✗ Benchmark failed for $policy: could not parse metrics (latency='${latency}' input='${input_throughput}' output='${output_throughput}')"
              exit 1
            fi
          done

          echo "Performance for $policy: ${latency}s | ${input_throughput} | ${output_throughput} tok/s"

          # Validate performance against fixed thresholds
          fail=""
          if (( $(echo "$latency > 1.5" | bc -l) )); then
            fail="Latency too high (${latency}s>1.5s) "
          fi
          if (( $(echo "$input_throughput < 20000" | bc -l) )); then
            fail="${fail}Input too low (${input_throughput}<20k) "
          fi
          if (( $(echo "$output_throughput < 1000" | bc -l) )); then
            fail="${fail}Output too low (${output_throughput}<1k) "
          fi

          if [ -n "$fail" ]; then
            echo "✗ Benchmark failed for $policy: $fail"
            exit 1
          else
            echo "✓ Performance validation passed for $policy"
          fi

          # Stop router before testing next policy
          echo "Stopping router for $policy..."
          # First try graceful shutdown
          kill "$ROUTER_PID" 2>/dev/null || true

          # Wait up to 5 seconds for graceful shutdown
          for i in {1..5}; do
            if ! ps -p "$ROUTER_PID" > /dev/null 2>&1; then
              echo "Router stopped gracefully"
              break
            fi
            sleep 1
          done

          # Force kill if still running
          if ps -p "$ROUTER_PID" > /dev/null 2>&1; then
            echo "Force killing router..."
            kill -9 "$ROUTER_PID" 2>/dev/null || true
          fi

          # The router is gone; clear the PID so the EXIT trap does not signal
          # an unrelated process that might later reuse it.
          ROUTER_PID=""

          # Short delay to ensure port is released
          sleep 2

          echo "✓ Completed testing for $policy"
        done

        echo ""
        echo "✅ All policies tested successfully!"


    # Publishes the per-policy benchmark_<policy>.txt files written by the test
    # step; the summarize-benchmarks job downloads this artifact by name.
    - name: Upload benchmark results
      if: success()
      uses: actions/upload-artifact@v4
      with:
        name: benchmark-results-all-policies
        path: benchmark_*.txt

    # Runs even when earlier steps failed so that no server or router process
    # is left behind on the runner.
    - name: Cleanup servers
      if: always()
      env:
        # Passed through the environment rather than interpolated into the
        # script text; empty when the start step never produced a PID.
        SERVER_PID: ${{ steps.start_servers.outputs.server_pid }}
      run: |
        if [ -n "$SERVER_PID" ]; then
          # Children of the launcher script first, then the launcher itself.
          pkill -P "$SERVER_PID" || true
          kill "$SERVER_PID" || true
        fi
        pkill -f "sglang.launch_server" || true
        # A router left over from an aborted test step would keep its fixed
        # address (127.0.0.9:8000) bound for the next run.
        pkill -f "sglang_router.launch_router" || true
        sleep 5

        # pgrep never counts itself (unlike "ps aux | grep -c", which always
        # sees the grep process). It prints 0 and exits 1 when nothing matches,
        # hence "|| true" rather than a fallback echo that would append a
        # second "0".
        remaining=$(pgrep -fc "sglang.launch_server" || true)
        echo "Cleanup completed. Remaining processes: ${remaining:-0}"

  # Renders the uploaded benchmark files as a Markdown table in the run summary.
  summarize-benchmarks:
    needs: test-disaggregation
    runs-on: ubuntu-latest
    if: success()

    steps:
    - name: Download benchmark results
      uses: actions/download-artifact@v4
      with:
        name: benchmark-results-all-policies

    - name: Create benchmark summary
      run: |
        # Print field number $2 of every line in file $3 that matches pattern $1.
        field_of() {
          awk -v pat="$1" -v col="$2" '$0 ~ pat {print $col}' "$3"
        }

        # Everything echoed inside the braces is appended to the step summary.
        {
          echo "## PD Router Benchmark Results Summary"
          echo ""
          echo "| Policy | Latency (s) | Input Throughput (tok/s) | Output Throughput (tok/s) |"
          echo "|--------|-------------|-------------------------|--------------------------|"

          for policy in random round_robin cache_aware power_of_two; do
            report="benchmark_${policy}.txt"
            # Policies without a result file are left out of the table.
            [ -f "$report" ] || continue

            lat=$(field_of "latency:" 2 "$report")
            in_tps=$(field_of "input throughput:" 3 "$report")
            out_tps=$(field_of "output throughput:" 3 "$report")

            echo "| ${policy} | ${lat} | ${in_tps} | ${out_tps} |"
          done

          echo ""
          echo "✅ All policies tested successfully!"
        } >> "$GITHUB_STEP_SUMMARY"