# CI workflow: tests the router policies against prefill/decode disaggregated servers.
name: Test Disaggregation Mode

# Triggers: pushes and pull requests against main that touch the
# disaggregation code, the server start script, or the router, plus
# manual dispatch.
on:
  push:
    branches:
      - main
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  pull_request:
    branches:
      - main
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  workflow_dispatch:
# One run per ref: a newer run on the same ref cancels the one in progress.
concurrency:
  cancel-in-progress: true
  group: test-disaggregation-${{ github.ref }}
# Least privilege for GITHUB_TOKEN: the jobs in this workflow only check out
# the repository, use the cache and artifact actions, and write the step
# summary. None of them comments on pull requests or issues, so the
# write scopes previously granted here are not needed.
permissions:
  contents: read
jobs:
  test-disaggregation:
    # Runs in the sgl-project/sglang repository, or on any pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on:
      - h200
    timeout-minutes: 45

    steps:
# Shallow clone limited to the 10 most recent commits.
- name: Checkout code
  uses: actions/checkout@v4
  with:
    fetch-depth: 10
# The version is quoted so it stays a string rather than a float.
- name: Setup Python
  uses: actions/setup-python@v4
  with:
    python-version: "3.11"
# Delegates the toolchain installation to the repository's CI script.
- name: Setup Rust
  run: bash scripts/ci_install_rust.sh
# Caches the cargo directories and the router's target directory.
# The exact key follows sgl-router/Cargo.lock; the restore key falls back
# to the most recent cargo cache for this runner OS.
- name: Cache Rust dependencies
  uses: actions/cache@v4
  with:
    key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }}
    restore-keys: |
      ${{ runner.os }}-cargo-
    path: |
      ~/.cargo/bin/
      ~/.cargo/registry/index/
      ~/.cargo/registry/cache/
      ~/.cargo/git/db/
      sgl-router/target/
# Caches pip's download cache, keyed on python/pyproject.toml.
# NOTE(review): the "Install SGLang dependencies" step passes
# --no-cache-dir to pip, so this cache may see little use — confirm.
- name: Cache pip dependencies
  uses: actions/cache@v4
  with:
    key: ${{ runner.os }}-pip-${{ hashFiles('python/pyproject.toml') }}
    restore-keys: |
      ${{ runner.os }}-pip-
    path: ~/.cache/pip
# Pre-flight checks: at least 8 GPUs, an active InfiniBand port on one of
# mlx5_0..mlx5_11, and the model directory present on local disk.
- name: Validate environment
  run: |
    echo "=== System Validation ==="
    nvidia-smi
    gpu_total=$(nvidia-smi -L | wc -l)
    echo "GPU count: $gpu_total"
    if [[ "$gpu_total" -lt 8 ]]; then
      echo "Error: This test requires at least 8 GPUs"
      exit 1
    fi

    echo "=== RDMA Validation ==="
    command -v ibv_devices >/dev/null 2>&1 || {
      echo "Error: InfiniBand tools not found"
      exit 1
    }

    # Probe each candidate device; stop at the first one whose first
    # reported port state is PORT_ACTIVE.
    active_device=""
    for candidate in mlx5_{0..11}; do
      ibv_devinfo "$candidate" >/dev/null 2>&1 || continue
      port_state=$(ibv_devinfo "$candidate" | grep "state:" | head -1 | awk '{print $2}')
      if [[ "$port_state" == "PORT_ACTIVE" ]]; then
        echo "✓ Found active device: $candidate"
        active_device="$candidate"
        break
      fi
    done

    if [[ -z "$active_device" ]]; then
      echo "Error: No active IB devices found"
      echo "Available devices:"
      ibv_devices || true
      exit 1
    fi

    echo "=== Model Validation ==="
    model_dir="/raid/models/meta-llama/Llama-3.1-8B-Instruct"
    if [[ ! -d "$model_dir" ]]; then
      echo "Error: Model not found"
      ls -la /raid/models/ || echo "No models directory"
      exit 1
    fi
    echo "✓ Model found"
# Editable install of the in-repo package with the "all" extra, followed by
# a pinned mooncake-transfer-engine. Both installs bypass pip's cache.
- name: Install SGLang dependencies
  run: |
    echo "Installing SGLang with all extras..."
    python3 -m pip --no-cache-dir install \
      -e "python[all]" \
      --break-system-packages
    python3 -m pip --no-cache-dir install \
      mooncake-transfer-engine==0.3.5
# Builds the router crate, packages it as a wheel, and installs that wheel.
# The commands run one per line; the step's shell stops at the first failure.
- name: Build and install sgl-router
  run: |
    source "$HOME/.cargo/env"
    echo "Building sgl-router..."
    cd sgl-router
    cargo build
    python3 -m build
    pip install --force-reinstall dist/*.whl
# Launches the server start script in the background and publishes its PID
# as the step output `server_pid` for the cleanup step.
- name: Start disaggregation servers
  id: start_servers
  run: |
    echo "Starting disaggregation servers..."
    bash scripts/ci_start_disaggregation_servers.sh &
    SERVER_PID=$!
    echo "server_pid=$SERVER_PID" >> "$GITHUB_OUTPUT"

    # Watch the launcher for up to 30 x 2s. This loop only observes whether
    # the launcher process is still alive; it does not probe the servers
    # themselves. If the launcher is still running when the loop ends, the
    # step continues.
    wait_count=0
    while [ "$wait_count" -lt 30 ]; do
      if ps -p "$SERVER_PID" > /dev/null; then
        sleep 2
        wait_count=$((wait_count + 1))
      else
        # The launcher exited: collect its status. The `||` keeps a non-zero
        # status from aborting the step (the default shell runs with -e)
        # before the error message below can be printed.
        exit_code=0
        wait "$SERVER_PID" || exit_code=$?
        if [ "$exit_code" -eq 0 ]; then
          echo "✓ All disaggregation servers are healthy"
          break
        else
          echo "Error: Server startup failed with code $exit_code"
          exit 1
        fi
      fi
    done

    echo "✓ Servers started (PID: $SERVER_PID)"
# For each routing policy: start the router in PD-disaggregation mode,
# smoke-test the chat completions API (plain and streaming), run a benchmark,
# enforce the performance thresholds, then stop the router.
- name: Test all policies sequentially
  timeout-minutes: 30
  run: |
    POLICIES=("random" "round_robin" "cache_aware" "power_of_two")
    BASE_URL="http://127.0.0.9:8000"
    ROUTER_PID=""

    # Stop the router of the current policy, if one was started:
    # SIGTERM, up to 5 seconds of grace, then SIGKILL.
    stop_router() {
      [ -n "$ROUTER_PID" ] || return 0
      kill "$ROUTER_PID" 2>/dev/null || true
      for _ in {1..5}; do
        if ! ps -p "$ROUTER_PID" > /dev/null 2>&1; then
          echo "Router stopped gracefully"
          break
        fi
        sleep 1
      done
      if ps -p "$ROUTER_PID" > /dev/null 2>&1; then
        echo "Force killing router..."
        kill -9 "$ROUTER_PID" 2>/dev/null || true
      fi
      ROUTER_PID=""
    }
    # Registered on EXIT so every failure path, including an abort caused by
    # a failing command under -e, stops the router instead of leaking it.
    trap stop_router EXIT

    # bc performs the floating-point threshold comparisons; check for it once
    # rather than on every loop iteration.
    command -v bc >/dev/null || (apt-get update && apt-get install -y bc)

    for policy in "${POLICIES[@]}"; do
      echo ""
      echo "=================================================="
      echo "Testing policy: $policy"
      echo "=================================================="

      # Start router with the current policy
      echo "Starting router with policy: $policy..."
      python3 -m sglang_router.launch_router \
        --pd-disaggregation \
        --policy "$policy" \
        --prefill http://127.0.0.1:30001 9001 \
        --prefill http://127.0.0.2:30002 9002 \
        --prefill http://127.0.0.3:30003 9003 \
        --prefill http://127.0.0.4:30004 9004 \
        --decode http://127.0.0.5:30005 \
        --decode http://127.0.0.6:30006 \
        --decode http://127.0.0.7:30007 \
        --decode http://127.0.0.8:30008 \
        --host 127.0.0.9 \
        --port 8000 &
      ROUTER_PID=$!

      # Wait for the router to answer HTTP, polling every 5 seconds.
      echo "Waiting for router to become healthy..."
      TIMEOUT=60
      ELAPSED=0
      while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
        if curl --connect-timeout 5 --silent "$BASE_URL" > /dev/null 2>&1; then
          echo "✓ Router is reachable"
          break
        fi
        if ! ps -p "$ROUTER_PID" > /dev/null; then
          echo "Error: Router process died"
          exit 1
        fi
        sleep 5
        ELAPSED=$((ELAPSED + 5))
      done

      # ELAPSED only reaches TIMEOUT when the loop ran out without a break.
      if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
        echo "Error: Router health check timeout"
        exit 1
      fi

      # Test API functionality
      echo "Testing API completions for $policy..."
      response=$(curl -s -X POST "$BASE_URL/v1/chat/completions" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer test-token" \
        -d '{
          "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
          "messages": [
            {"role": "user", "content": "Write a Python function to calculate fibonacci numbers recursively"}
          ],
          "stream": false,
          "max_tokens": 100
        }')

      if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
        echo "✓ API test passed for $policy"
      else
        echo "✗ API test failed for $policy: $response"
        exit 1
      fi

      # Test streaming
      echo "Testing streaming API for $policy..."
      stream_response=$(timeout 30 curl -s -X POST "$BASE_URL/v1/chat/completions" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer test-token" \
        -d '{
          "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
          "messages": [
            {"role": "user", "content": "Count from 1 to 5"}
          ],
          "stream": true,
          "max_tokens": 50
        }')

      if echo "$stream_response" | grep -q "data:"; then
        echo "✓ Streaming API test passed for $policy"
      else
        echo "✗ Streaming API test failed for $policy"
        exit 1
      fi

      # Run benchmark
      echo "Running benchmark for $policy..."
      benchmark_output=$(python3 -m sglang.bench_one_batch_server \
        --model-path "/raid/models/meta-llama/Llama-3.1-8B-Instruct" \
        --base-url "$BASE_URL" \
        --batch-size 8 \
        --input-len 4096 \
        --output-len 5 \
        --skip-warmup)

      echo "$benchmark_output"

      # Save benchmark output for the artifact upload step
      echo "$benchmark_output" > "benchmark_${policy}.txt"

      # Extract metrics from the benchmark's text output
      latency=$(echo "$benchmark_output" | grep "latency:" | awk '{print $2}' | sed 's/s//')
      input_throughput=$(echo "$benchmark_output" | grep "input throughput:" | awk '{print $3}')
      output_throughput=$(echo "$benchmark_output" | grep "output throughput:" | awk '{print $3}')

      # Refuse to continue with a missing or non-numeric metric. Without
      # this guard bc prints nothing for a malformed expression, the
      # comparisons below evaluate to false, and validation passes silently.
      for metric in latency input_throughput output_throughput; do
        if ! [[ "${!metric}" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
          echo "✗ Benchmark failed for $policy: could not parse $metric from benchmark output (got '${!metric}')"
          exit 1
        fi
      done

      echo "Performance for $policy: ${latency}s | ${input_throughput} | ${output_throughput} tok/s"

      # Validate performance; collect every violated threshold.
      fail=""
      if (( $(echo "$latency > 1.5" | bc -l) )); then
        fail="Latency too high (${latency}s>1.5s) "
      fi
      if (( $(echo "$input_throughput < 20000" | bc -l) )); then
        fail="${fail}Input too low (${input_throughput}<20k) "
      fi
      if (( $(echo "$output_throughput < 1000" | bc -l) )); then
        fail="${fail}Output too low (${output_throughput}<1k) "
      fi

      if [ -n "$fail" ]; then
        echo "✗ Benchmark failed for $policy: $fail"
        exit 1
      else
        echo "✓ Performance validation passed for $policy"
      fi

      # Stop router before testing next policy
      echo "Stopping router for $policy..."
      stop_router

      # Short delay to ensure port is released
      sleep 2

      echo "✓ Completed testing for $policy"
    done

    echo ""
    echo "✅ All policies tested successfully!"
# Publishes the per-policy benchmark text files as one artifact; runs only
# when the preceding steps succeeded.
- name: Upload benchmark results
  uses: actions/upload-artifact@v4
  if: success()
  with:
    path: benchmark_*.txt
    name: benchmark-results-all-policies
# Always runs: terminates the launcher started by the start_servers step,
# its children, and any leftover router or server processes.
- name: Cleanup servers
  if: always()
  env:
    # Passed through the environment instead of being templated into the
    # script text.
    SERVER_PID: ${{ steps.start_servers.outputs.server_pid }}
  run: |
    if [ -n "$SERVER_PID" ]; then
      pkill -P "$SERVER_PID" || true
      kill "$SERVER_PID" || true
    fi
    # A router left behind by an aborted policy test would otherwise keep
    # running after this job ends.
    pkill -f "sglang_router.launch_router" || true
    pkill -f "sglang.launch_server" || true
    sleep 5
    # pgrep does not count itself and prints 0 when nothing matches; the
    # `|| true` only absorbs its non-zero exit status in that case.
    remaining=$(pgrep -c -f "sglang.launch_server" || true)
    echo "Cleanup completed. Remaining processes: $remaining"
# Follow-up job on a hosted runner; runs only after test-disaggregation
# succeeded.
summarize-benchmarks:
  runs-on: ubuntu-latest
  needs: test-disaggregation
  if: success()

  steps:
# Fetches the artifact uploaded by the test-disaggregation job.
- name: Download benchmark results
  uses: actions/download-artifact@v4
  with:
    name: benchmark-results-all-policies
# Renders a Markdown table with one row per policy whose benchmark file is
# present, appended to the job's step summary.
- name: Create benchmark summary
  run: |
    {
      echo "## PD Router Benchmark Results Summary"
      echo ""
      echo "| Policy | Latency (s) | Input Throughput (tok/s) | Output Throughput (tok/s) |"
      echo "|--------|-------------|-------------------------|--------------------------|"

      for policy in random round_robin cache_aware power_of_two; do
        report="benchmark_${policy}.txt"
        # Policies without a result file are left out of the table.
        [ -f "$report" ] || continue

        latency=$(grep "latency:" "$report" | awk '{print $2}')
        input_throughput=$(grep "input throughput:" "$report" | awk '{print $3}')
        output_throughput=$(grep "output throughput:" "$report" | awk '{print $3}')

        echo "| ${policy} | ${latency} | ${input_throughput} | ${output_throughput} |"
      done

      echo ""
      echo "✅ All policies tested successfully!"
    } >> "$GITHUB_STEP_SUMMARY"