diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index cb08ec534..098be925e 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -290,6 +290,25 @@ jobs: run: | bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600 + unit-test-backend-8-gpu-CAR-amd: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + strategy: + matrix: + runner: [linux-mi300-gpu-8] + runs-on: ${{ matrix.runner }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Start CI container + run: bash scripts/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/amd_ci_install_dependency.sh + - name: Run CustomAllReduce test timeout-minutes: 20 run: | diff --git a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py index a1d28f2fc..0a506d35f 100644 --- a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py +++ b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py @@ -398,7 +398,7 @@ class CustomAllreduce: else: # If warm up, mimic the allocation pattern since custom # allreduce is out-of-place. - return torch.empty_like(input) + return torch.zeros_like(input) else: if _is_hip: # note: outside of cuda graph context,