sglang/.github/workflows/pr-test-pd-router.yml

name: PR Test (PD Router)

# Triggers: pushes and pull requests against main that touch the
# disaggregation code, the server bootstrap script, or the router sources,
# plus manual dispatch.
on:
  push:
    branches: [ main ]
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  pull_request:
    branches: [ main ]
    # `ready_for_review` is listed explicitly: the test job skips draft PRs, and
    # the default activity types (opened, synchronize, reopened) do not fire
    # when a draft is marked ready, so such a PR would otherwise stay untested
    # until its next push.
    types: [opened, synchronize, reopened, ready_for_review]
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  workflow_dispatch:

# One run per ref; a newer run cancels the one still in progress.
concurrency:
  group: test-disaggregation-${{ github.ref }}
  cancel-in-progress: true

# NOTE(review): no step in this workflow appears to write to pull requests or
# issues; confirm whether the two write scopes are still required.
permissions:
  contents: read
  pull-requests: write
  issues: write

jobs:
  # End-to-end PD router test on the 8-GPU runner.
  # Runs for every event in the upstream repository, or for pull_request
  # events in any repository; draft pull requests are skipped.
  test-disaggregation:
    timeout-minutes: 45
    runs-on:
    - h200
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
      github.event.pull_request.draft == false

    steps:
    # Shallow checkout: only the 10 most recent commits are fetched.
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        fetch-depth: 10

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.12"

    # Rust toolchain is installed by the repository's own CI helper script.
    - name: Setup Rust
      run: bash scripts/ci/ci_install_rust.sh

    # Cargo registry/git caches and the router build directory, keyed on the
    # router's lock file; falls back to the newest cache for this OS.
    - name: Cache Rust dependencies
      uses: actions/cache@v4
      with:
        key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }}
        restore-keys: |
          ${{ runner.os }}-cargo-
        path: |
          ~/.cargo/bin/
          ~/.cargo/registry/index/
          ~/.cargo/registry/cache/
          ~/.cargo/git/db/
          sgl-router/target/

    # NOTE(review): the install step passes --no-cache-dir to every pip call,
    # so this cache directory is likely never populated; confirm it is needed.
    - name: Cache pip dependencies
      uses: actions/cache@v4
      with:
        key: ${{ runner.os }}-pip-${{ hashFiles('python/pyproject.toml') }}
        restore-keys: |
          ${{ runner.os }}-pip-
        path: ~/.cache/pip

    - name: Validate environment
      run: |
        # Pre-flight checks for the runner: GPU count, idle GPUs, an active
        # InfiniBand port, and the model weights on local disk. Any failed
        # check aborts the job before the expensive install/build steps.
        echo "=== System Validation ==="
        # Guard before the first nvidia-smi call so a missing driver yields a
        # clear error here rather than a bare "command not found".
        if ! command -v nvidia-smi >/dev/null 2>&1; then
          echo "Error: nvidia-smi not found; cannot validate GPUs"
          exit 1
        fi
        nvidia-smi
        # Query the GPU list once and reuse the count.
        gpu_count=$(nvidia-smi -L | wc -l)
        echo "GPU count: $gpu_count"
        if [ "$gpu_count" -lt 8 ]; then
          echo "Error: This test requires at least 8 GPUs"
          exit 1
        fi

        echo "=== GPU Process Check ==="
        # Fail fast if any GPU compute processes are active.
        # Try to query compute apps first (preferred and concise)
        gpu_procs=$(nvidia-smi --query-compute-apps=pid,process_name,gpu_uuid --format=csv,noheader 2>/dev/null | sed '/^$/d' || true)

        # Fallback to detailed PIDS report if the query returns nothing but there might still be processes
        if [ -z "$gpu_procs" ]; then
          gpu_procs=$(nvidia-smi -q -d PIDS 2>/dev/null | awk '/Processes/{flag=1;next}/^$/{flag=0}flag' | sed '/^\s*Processes:/d' | sed '/^\s*$/d' || true)
        fi

        if [ -n "$gpu_procs" ]; then
          echo "Error: Found active GPU processes using the device(s):"
          echo "$gpu_procs"
          exit 1
        fi
        echo "No active GPU compute processes detected."

        echo "=== RDMA Validation ==="
        if ! command -v ibv_devices >/dev/null 2>&1; then
          echo "Error: InfiniBand tools not found"
          exit 1
        fi

        # Probe mlx5_0 .. mlx5_11 and stop at the first device whose first
        # reported port state is PORT_ACTIVE.
        found_active_device=false
        for device in mlx5_{0..11}; do
          if ibv_devinfo "$device" >/dev/null 2>&1; then
            state=$(ibv_devinfo "$device" | grep "state:" | head -1 | awk '{print $2}')
            if [[ "$state" == "PORT_ACTIVE" ]]; then
              echo "✓ Found active device: $device"
              found_active_device=true
              break
            fi
          fi
        done

        if [ "$found_active_device" = false ]; then
          echo "Error: No active IB devices found"
          echo "Available devices:"
          ibv_devices || true
          exit 1
        fi

        echo "=== Model Validation ==="
        if [ ! -d "/raid/models/meta-llama/Llama-3.1-8B-Instruct" ]; then
          echo "Error: Model not found"
          ls -la /raid/models/ || echo "No models directory"
          exit 1
        fi
        echo "✓ Model found"

    - name: Install SGLang dependencies
      run: |
        # Every install goes through the same pip invocation with caching
        # disabled; the package pins below are the versions this test targets.
        pip_install=(python3 -m pip --no-cache-dir install)

        echo "Installing SGLang with all extras..."
        "${pip_install[@]}" --upgrade pip
        "${pip_install[@]}" torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
        "${pip_install[@]}" -e "python[all]" --break-system-packages
        "${pip_install[@]}" mooncake-transfer-engine==0.3.5
        "${pip_install[@]}" --user --force-reinstall genai-bench==0.0.2
        "${pip_install[@]}" sgl-kernel==0.3.9.post2

    - name: Build and install sgl-router
      run: |
        # Load the cargo environment, then compile the crate, build the Python
        # wheel, and install that wheel; each stage runs only if the previous
        # one succeeded.
        source "$HOME/.cargo/env"
        echo "Building sgl-router..."
        cd sgl-router
        cargo build \
          && python3 -m build \
          && pip install --force-reinstall dist/*.whl

    - name: Start disaggregation servers
      id: start_servers
      run: |
        # Launches the bootstrap script in the background and polls for the
        # marker file whose path is handed to it via DISAGG_READY_FILE.
        # The script's PID is exported as the `server_pid` step output.
        ready_marker=".disagg_ready"
        max_wait=300
        poll_interval=5

        echo "Starting disaggregation servers..."
        rm -f "$ready_marker"
        DISAGG_READY_FILE="$ready_marker" bash scripts/ci/ci_start_disaggregation_servers.sh &
        bootstrap_pid=$!
        echo "server_pid=$bootstrap_pid" >> $GITHUB_OUTPUT

        # Wait until script signals readiness (8/8 healthy) or timeout
        waited=0
        while true; do
          if [ $waited -ge $max_wait ]; then
            echo "❌ Timeout waiting for disaggregation servers to be healthy"
            exit 1
          fi
          if [ -f "$ready_marker" ]; then
            echo "✓ All disaggregation servers are healthy (signal detected)"
            break
          fi
          # A dead bootstrap script can never create the marker; stop early.
          if ! ps -p $bootstrap_pid > /dev/null; then
            echo "Error: server bootstrap script exited prematurely"
            exit 1
          fi
          sleep $poll_interval
          waited=$((waited + poll_interval))
        done

        echo "✓ Servers started (PID: $bootstrap_pid)"


    - name: Test all policies sequentially
      timeout-minutes: 30
      run: |
        # For each routing policy: start the router in front of the four
        # prefill and four decode servers, smoke-test the chat completions API
        # (plain and streaming), run genai-bench, and gate on the mean
        # latency/throughput figures it reports.
        POLICIES=("random" "round_robin" "cache_aware" "power_of_two")
        BASE_URL="http://127.0.0.9:8000"
        ROUTER_PID=""

        # Mean thresholds (allowing for reasonable variance).
        # These can be adjusted based on your performance requirements.
        ttft_threshold=4.7                  # Max 4.7 seconds for mean TTFT
        e2e_latency_threshold=35.0          # Max 35.0 seconds for mean E2E latency
        input_throughput_threshold=12000    # Min 12000 tokens/s for mean input throughput
        output_throughput_threshold=68      # Min 68 tokens/s for mean output throughput

        # Kill whatever is listening on the router's metrics and API ports.
        free_ports() {
          fuser -k -n tcp 29000 2>/dev/null || true
          fuser -k -n tcp 8000 2>/dev/null || true
        }

        # Stop the current router: graceful signal, up to 5 seconds of grace,
        # then a forced kill. No-op when no router is running.
        stop_router() {
          if [ -z "$ROUTER_PID" ]; then
            return 0
          fi
          kill "$ROUTER_PID" 2>/dev/null || true
          for _ in {1..5}; do
            if ! ps -p "$ROUTER_PID" > /dev/null 2>&1; then
              echo "Router stopped gracefully"
              break
            fi
            sleep 1
          done
          if ps -p "$ROUTER_PID" > /dev/null 2>&1; then
            echo "Force killing router..."
            kill -9 "$ROUTER_PID" 2>/dev/null || true
          fi
          ROUTER_PID=""
        }
        # Covers every exit path, including aborts triggered by `bash -e`
        # (e.g. a failing genai-bench run), so the router never outlives the step.
        trap stop_router EXIT

        # True when $1 is a plain or scientific-notation number. jq prints the
        # string "null" for a missing field, which must not reach a comparison.
        number_re='^-?[0-9]+([.][0-9]+)?([eE][-+]?[0-9]+)?$'
        is_number() { [[ "$1" =~ $number_re ]]; }

        # Numeric comparisons: succeed when $1 > $2 / $1 < $2 respectively.
        greater_than() { awk -v v="$1" -v t="$2" 'BEGIN { exit !(v + 0 > t + 0) }'; }
        less_than() { awk -v v="$1" -v t="$2" 'BEGIN { exit !(v + 0 < t + 0) }'; }

        # Free commonly used ports for router and metrics
        echo "Freeing ports 29000 (metrics) and 8000 (API), if in use..."
        free_ports
        sleep 1

        for policy in "${POLICIES[@]}"; do
          echo ""
          echo "=================================================="
          echo "Testing policy: $policy"
          echo "=================================================="

          # Free ports before starting router
          free_ports

          # Start router with the current policy
          echo "Starting router with policy: $policy..."
          RUST_BACKTRACE=1 python3 -m sglang_router.launch_router \
            --pd-disaggregation \
            --policy "$policy" \
            --prefill http://127.0.0.1:30001 9001 \
            --prefill http://127.0.0.2:30002 9002 \
            --prefill http://127.0.0.3:30003 9003 \
            --prefill http://127.0.0.4:30004 9004 \
            --decode http://127.0.0.5:30005 \
            --decode http://127.0.0.6:30006 \
            --decode http://127.0.0.7:30007 \
            --decode http://127.0.0.8:30008 \
            --host 127.0.0.9 \
            --port 8000 &
          ROUTER_PID=$!

          # Wait for router to become healthy
          echo "Waiting for router to become healthy..."
          TIMEOUT=60
          ELAPSED=0
          while [ $ELAPSED -lt $TIMEOUT ]; do
            if curl --connect-timeout 5 --silent "$BASE_URL" > /dev/null 2>&1; then
              echo "✓ Router is reachable"
              break
            fi
            if ! ps -p "$ROUTER_PID" > /dev/null; then
              echo "Error: Router process died"
              # Nothing left to stop; keep the exit trap quiet.
              ROUTER_PID=""
              exit 1
            fi
            sleep 5
            ELAPSED=$((ELAPSED + 5))
          done

          if [ $ELAPSED -ge $TIMEOUT ]; then
            echo "Error: Router health check timeout"
            exit 1
          fi

          # Test API functionality. `|| true` keeps a curl failure from
          # aborting the step before the diagnostic below is printed.
          echo "Testing API completions for $policy..."
          response=$(curl -s -X POST "$BASE_URL/v1/chat/completions" \
            -H "Content-Type: application/json" \
            -H "Authorization: Bearer test-token" \
            -d '{
              "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
              "messages": [
                {"role": "user", "content": "Write a Python function to calculate fibonacci numbers recursively"}
              ],
              "stream": false,
              "max_tokens": 100
            }') || true

          if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
            echo "✓ API test passed for $policy"
          else
            echo "✗ API test failed for $policy: $response"
            exit 1
          fi

          # Test streaming; a timeout or curl error falls through to the
          # explicit failure message instead of aborting silently.
          echo "Testing streaming API for $policy..."
          stream_response=$(timeout 30 curl -s -X POST "$BASE_URL/v1/chat/completions" \
            -H "Content-Type: application/json" \
            -H "Authorization: Bearer test-token" \
            -d '{
              "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
              "messages": [
                {"role": "user", "content": "Count from 1 to 5"}
              ],
              "stream": true,
              "max_tokens": 50
            }') || true

          if echo "$stream_response" | grep -q "data:"; then
            echo "✓ Streaming API test passed for $policy"
          else
            echo "✗ Streaming API test failed for $policy"
            exit 1
          fi

          # Run genai-bench benchmark
          echo "Running genai-bench for $policy..."
          genai-bench benchmark \
            --api-backend openai \
            --api-base "$BASE_URL" \
            --api-key "dummy-token" \
            --api-model-name "/raid/models/meta-llama/Llama-3.1-8B-Instruct" \
            --model-tokenizer /raid/models/meta-llama/Llama-3.1-8B-Instruct \
            --task text-to-text \
            --num-concurrency 64 \
            --traffic-scenario "D(8000,2000)" \
            --max-requests-per-run 640 \
            --max-time-per-run 2 \
            --experiment-folder-name "benchmark_${policy}" \
            --experiment-base-dir "."

          # Find the actual experiment folder
          actual_folder=$(find . -maxdepth 1 -name "benchmark_${policy}" -type d | head -1)
          if [ -z "$actual_folder" ]; then
            echo "✗ Benchmark failed for $policy: Experiment folder not found"
            exit 1
          fi
          echo "Genai-bench results saved in: $actual_folder"

          # Extract mean values and validate performance thresholds
          echo "📊 Extracting performance metrics for $policy..."

          # Result JSON files, excluding experiment metadata. Filtering inside
          # find (rather than piping to grep -v) keeps an empty result from
          # aborting the step before the check below can report it.
          mapfile -t json_files < <(find "$actual_folder" -name "*.json" -not -path "*experiment_metadata*")
          if [ "${#json_files[@]}" -eq 0 ]; then
            echo "✗ Benchmark failed for $policy: No JSON results found"
            exit 1
          fi

          for json_file in "${json_files[@]}"; do
            echo "Processing: $(basename "$json_file")"

            # Extract mean values for performance validation
            ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean' "$json_file")
            e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean' "$json_file")
            input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean' "$json_file")
            output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean' "$json_file")

            echo "  TTFT mean: ${ttft_mean}s"
            echo "  E2E Latency mean: ${e2e_latency_mean}s"
            echo "  Input Throughput mean: ${input_throughput_mean} tokens/s"
            echo "  Output Throughput mean: ${output_throughput_mean} tokens/s"

            validation_passed=true

            # A missing metric is a failure, not a pass.
            for metric in ttft_mean e2e_latency_mean input_throughput_mean output_throughput_mean; do
              if ! is_number "${!metric}"; then
                echo "❌ Metric $metric is missing or not numeric: '${!metric}'"
                validation_passed=false
              fi
            done

            # Validate mean thresholds (only meaningful for numeric values)
            if [ "$validation_passed" = true ]; then
              if greater_than "$ttft_mean" "$ttft_threshold"; then
                echo "❌ TTFT validation failed: $ttft_mean > $ttft_threshold"
                validation_passed=false
              fi

              if greater_than "$e2e_latency_mean" "$e2e_latency_threshold"; then
                echo "❌ E2E Latency validation failed: $e2e_latency_mean > $e2e_latency_threshold"
                validation_passed=false
              fi

              if less_than "$input_throughput_mean" "$input_throughput_threshold"; then
                echo "❌ Input Throughput validation failed: $input_throughput_mean < $input_throughput_threshold"
                validation_passed=false
              fi

              if less_than "$output_throughput_mean" "$output_throughput_threshold"; then
                echo "❌ Output Throughput validation failed: $output_throughput_mean < $output_throughput_threshold"
                validation_passed=false
              fi
            fi

            if [ "$validation_passed" = true ]; then
              echo "✅ Performance validation passed for $policy"
            else
              echo "❌ Performance validation failed for $policy"
              exit 1
            fi
          done

          echo "✓ Genai-bench completed successfully for $policy"
          echo "📊 Detailed metrics and plots available in: $actual_folder"

          # Stop router before testing next policy
          echo "Stopping router for $policy..."
          stop_router

          # Short delay to ensure port is released
          sleep 2

          echo "✓ Completed testing for $policy"
        done

        echo ""
        echo "✅ All policies tested successfully!"


    # Publishes every benchmark_* directory produced by the policy loop as one
    # artifact; runs only when all previous steps succeeded.
    - name: Upload benchmark results
      uses: actions/upload-artifact@v4
      if: success()
      with:
        path: benchmark_**/
        name: genai-bench-results-all-policies

    - name: Cleanup servers
      if: always()
      env:
        # Empty when the start step never reached the point of exporting it.
        SERVER_PID: ${{ steps.start_servers.outputs.server_pid }}
      run: |
        # Best-effort teardown: every kill is allowed to fail so that cleanup
        # itself never fails the job.
        if [ -n "$SERVER_PID" ]; then
          # Children of the bootstrap script first, then the script itself.
          pkill -P "$SERVER_PID" || true
          kill "$SERVER_PID" || true
        fi
        pkill -f "sglang.launch_server" || true
        # The router is started by the policy test step and may still be alive
        # if that step aborted part-way through.
        pkill -f "sglang_router.launch_router" || true
        sleep 5
        # pgrep excludes itself from the match (unlike `ps | grep -c`) and
        # prints 0 with a non-zero status when nothing matches.
        remaining=$(pgrep -fc "sglang.launch_server" || true)
        echo "Cleanup completed. Remaining processes: ${remaining:-0}"

  # Renders the uploaded genai-bench results into the job summary; runs on a
  # hosted runner once the GPU job has succeeded.
  summarize-benchmarks:
    runs-on: ubuntu-latest
    needs: test-disaggregation
    if: success()

    steps:
    # Installs both jq and bc, which the summary script relies on.
    - name: Install jq
      run: |
        sudo apt-get update \
          && sudo apt-get install -y jq bc

    - name: Download benchmark results
      uses: actions/download-artifact@v4
      with:
        name: genai-bench-results-all-policies

    # Diagnostic listing of what the artifact download produced.
    - name: List downloaded contents
      run: |
        echo "Contents after download:"
        ls -la
        find . -name "benchmark_*" -type d

        echo "JSON files found:"
        find . -name "*.json" | head -10

    - name: Create benchmark summary
      run: |
        # Writes a per-policy results table and a threshold check to the job
        # summary, reading the mean metrics from each policy's genai-bench JSON.

        # Thresholds mirror the gate enforced by the test-disaggregation job.
        # They are defined once here so the text shown in the summary and the
        # values actually checked cannot drift apart.
        ttft_max=4.7
        e2e_latency_max=35.0
        input_throughput_min=12000
        output_throughput_min=68

        POLICIES=(random round_robin cache_aware power_of_two)

        # Print the result folder for policy $1 (empty if none is found).
        find_result_folder() {
          local folder
          # Handle zip extraction structure
          folder=$(find . -maxdepth 2 -name "benchmark_$1" -type d | head -1)
          if [ -z "$folder" ]; then
            # Try alternative patterns in case of different extraction structure
            folder=$(find . -maxdepth 3 -path "*benchmark_$1*" -type d | head -1)
          fi
          echo "$folder"
        }

        # Print the mean of metric $2 from JSON file $1, or "N/A".
        read_metric() {
          jq -r ".aggregated_metrics.stats.$2.mean // \"N/A\"" "$1" 2>/dev/null || echo "N/A"
        }

        # True when $1 holds an actual value rather than a placeholder.
        has_value() {
          [ "$1" != "N/A" ] && [ "$1" != "null" ]
        }

        # Print value $2 using printf format $1, or "N/A" for placeholders.
        format_metric() {
          if has_value "$2"; then
            printf "$1" "$2" 2>/dev/null || echo "$2"
          else
            echo "N/A"
          fi
        }

        # True when the comparison "$1 $2 $3" (e.g. 5.1 > 4.7) holds.
        breaches() {
          [ "$(echo "$1 $2 $3" | bc -l 2>/dev/null)" = "1" ]
        }

        echo "=== DEBUG: Creating benchmark summary ==="
        echo "Available benchmark directories:"
        find . -name "benchmark_*" -type d
        echo "=========================================="

        {
          echo "## PD Router Genai-Bench Results Summary"
          echo ""
          echo "🚀 **Benchmarked with genai-bench for comprehensive LLM serving performance evaluation**"
          echo ""
          echo "| Policy | Status | TTFT (s) | E2E Latency (s) | Input Throughput (tok/s) | Output Throughput (tok/s) |"
          echo "|--------|--------|----------|-----------------|--------------------------|---------------------------|"
        } >> "$GITHUB_STEP_SUMMARY"

        # Single pass per policy: emit the table row immediately and collect
        # the validation line for the section rendered after the table.
        validation_summary=""
        for policy in "${POLICIES[@]}"; do
          result_folder=$(find_result_folder "$policy")
          echo "DEBUG: Policy ${policy} -> Found folder: ${result_folder:-'NOT FOUND'}"

          if [ -z "$result_folder" ] || [ ! -d "$result_folder" ]; then
            echo "| ${policy} | ❌ Failed | N/A | N/A | N/A | N/A |" >> "$GITHUB_STEP_SUMMARY"
            validation_summary="${validation_summary}- **${policy}**: ❌ Failed\n"
            continue
          fi

          # Find JSON file with metrics
          json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1)
          if [ -z "$json_file" ] || [ ! -f "$json_file" ]; then
            echo "| ${policy} | ❌ No Data | N/A | N/A | N/A | N/A |" >> "$GITHUB_STEP_SUMMARY"
            validation_summary="${validation_summary}- **${policy}**: ❌ No data\n"
            continue
          fi

          # Extract performance metrics
          ttft_mean=$(read_metric "$json_file" ttft)
          e2e_latency_mean=$(read_metric "$json_file" e2e_latency)
          input_throughput_mean=$(read_metric "$json_file" input_throughput)
          output_throughput_mean=$(read_metric "$json_file" output_throughput)

          # Format numbers for display (latencies: 2 decimals, throughput: integer)
          ttft_display=$(format_metric "%.2f" "$ttft_mean")
          e2e_display=$(format_metric "%.2f" "$e2e_latency_mean")
          input_display=$(format_metric "%.0f" "$input_throughput_mean")
          output_display=$(format_metric "%.0f" "$output_throughput_mean")

          echo "| ${policy} | ✅ Success | $ttft_display | $e2e_display | $input_display | $output_display |" >> "$GITHUB_STEP_SUMMARY"

          # Check thresholds; metrics without a value are not evaluated here.
          validation_status="✅"
          if has_value "$ttft_mean" && breaches "$ttft_mean" ">" "$ttft_max"; then
            validation_status="❌"
          fi
          if has_value "$e2e_latency_mean" && breaches "$e2e_latency_mean" ">" "$e2e_latency_max"; then
            validation_status="❌"
          fi
          if has_value "$input_throughput_mean" && breaches "$input_throughput_mean" "<" "$input_throughput_min"; then
            validation_status="❌"
          fi
          if has_value "$output_throughput_mean" && breaches "$output_throughput_mean" "<" "$output_throughput_min"; then
            validation_status="❌"
          fi

          validation_summary="${validation_summary}- **${policy}**: $validation_status\n"
        done

        # Add performance validation summary
        {
          echo ""
          echo "## 📊 Performance Validation"
          echo ""
          echo "**Thresholds:** TTFT ≤ ${ttft_max}s | E2E Latency ≤ ${e2e_latency_max}s | Input Throughput ≥ ${input_throughput_min} tok/s | Output Throughput ≥ ${output_throughput_min} tok/s"
          echo ""
          echo -e "$validation_summary"
          echo ""
          echo "## 📊 Genai-Bench Features Used"
          echo "- **Token-level Performance**: TTFT, TPOT, End-to-End latency"
          echo "- **Throughput Analysis**: Input/Output/Total token throughput"
          echo "- **Statistical Analysis**: Percentiles, mean, std dev for all metrics"
          echo "- **Visual Reports**: Automated plots and Excel summaries"
          echo "- **SGLang Backend**: Native integration with SGLang serving"
          echo ""
          echo "✅ All policies tested successfully with genai-bench!"
        } >> "$GITHUB_STEP_SUMMARY"