diff --git a/.github/workflows/nightly-eval.yml b/.github/workflows/nightly-eval.yml new file mode 100644 index 000000000..c1a7e9c17 --- /dev/null +++ b/.github/workflows/nightly-eval.yml @@ -0,0 +1,66 @@ +name: Nightly Evaluation + +on: + schedule: + - cron: '0 0 * * *' + push: + branches: + - main + paths: + - "python/sglang/version.py" + workflow_dispatch: + +concurrency: + group: nightly-eval-${{ github.ref }} + cancel-in-progress: true + +jobs: + meta-llama-31-8b-instruct: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: 1-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -e "python[dev]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + git clone https://github.com/EleutherAI/lm-evaluation-harness + pushd lm-evaluation-harness + pip install -e . + pip install lm_eval[api] + popd + + - name: Run eval + timeout-minutes: 20 + run: | + python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --disable-radix-cache & + + echo "Waiting for server to start..." + for i in {1..120}; do + if curl -s http://127.0.0.1:30000/health; then + echo "Server is up!" + break + fi + if [ $i -eq 120 ]; then + echo "Server failed to start within 120 seconds" + exit 1 + fi + sleep 1 + done + + lm_eval --model local-completions --tasks gsm8k --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,base_url=http://127.0.0.1:30000/v1/completions,num_concurrent=128,max_retries=3,tokenized_requests=False + + echo "Stopping server..." + kill -9 $(ps aux | grep sglang | grep Meta-Llama-3.1-8B-Instruct | grep -v grep | awk '{print $2}') + + finish: + needs: [ + meta-llama-31-8b-instruct + ] + runs-on: ubuntu-latest + steps: + - name: Finish + run: echo "This is an empty step to ensure that all jobs are completed."