[AMD] Add unit-test-sgl-kernel-amd to AMD CI (#7539)
.github/workflows/pr-test-amd.yml (36 lines changed)
@@ -290,12 +290,46 @@ jobs:
         run: |
           bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
 
+      - name: Run CustomAllReduce test
+        timeout-minutes: 10
+        run: |
+          CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/amd_ci_exec.sh python3 -m unittest test_custom_allreduce.TestCustomAllReduce
+
+  unit-test-sgl-kernel-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 10
+        run: |
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
+
   finish:
     if: always()
     needs: [
       accuracy-test-1-gpu-amd, mla-test-1-gpu-amd, bench-test-2-gpu-amd,
       accuracy-test-2-gpu-amd, performance-test-1-gpu-part-1-amd, performance-test-1-gpu-part-2-amd,
-      unit-test-backend-1-gpu-amd, unit-test-backend-2-gpu-amd, unit-test-backend-8-gpu-amd
+      unit-test-backend-1-gpu-amd, unit-test-backend-2-gpu-amd, unit-test-backend-8-gpu-amd,
+      unit-test-sgl-kernel-amd
     ]
     runs-on: ubuntu-latest
     steps:
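Note: the new unit-test-sgl-kernel-amd job fans out over a runner matrix (MI300 and MI325) and runs each kernel test file as its own pytest invocation inside the ci_sglang container. To reproduce the Run test step locally, a minimal sketch, assuming a checkout with pytest installed and run from sgl-kernel/tests (the speculative test lives one directory down):

    import sys
    import pytest

    # Mirror the CI step: invoke pytest per file and stop at the first
    # failure, matching the fail-fast behavior of the multi-line run block.
    for test_file in ["test_moe_align.py", "test_moe_topk_softmax.py"]:
        ret = pytest.main([test_file])
        if ret != 0:
            sys.exit(int(ret))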
@@ -19,3 +19,4 @@ mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpubli
 docker cp ./dummy-grok ci_sglang:/
 
 docker exec ci_sglang pip install huggingface_hub[hf_xet]
+docker exec ci_sglang pip install pytest
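Note: this hunk appears to belong to the dependency script the Install dependencies step runs (presumably scripts/amd_ci_install_dependency.sh). The pytest install complements the workflow change: the backend unit-test jobs drive tests through the unittest runner, but the new unit-test-sgl-kernel-amd job calls python3 -m pytest directly inside the ci_sglang container, so the container must provide pytest.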
@@ -56,22 +56,30 @@ def multi_process_parallel(
 
 
 class TestCustomAllReduce(CustomTestCase):
+    TEST_SIZES = [
+        512,
+        4096,
+        32768,
+        262144,
+        2097152,
+        16777216,
+        33554432,
+    ]  # 512B...32MB
+    WORLD_SIZES = [2, 4, 6, 8]
+    TEST_LOOP = 10
+
     @classmethod
     def setUpClass(cls):
-        random.seed(42)
-        # 512B to 32MB
-        cls.test_sizes = [512, 4096, 32768, 262144, 2097152, 16777216, 33554432]
-        cls.world_sizes = [2, 4, 6, 8]
-        cls.test_loop = 10
+        random.seed(42)  # keep the deterministic seed
 
     def test_graph_allreduce(self):
-        for world_size in self.world_sizes:
+        for world_size in self.WORLD_SIZES:
             if world_size > torch.cuda.device_count():
                 continue
             multi_process_parallel(world_size, self, self.graph_allreduce)
 
     def test_eager_allreduce(self):
-        for world_size in self.world_sizes:
+        for world_size in self.WORLD_SIZES:
             if world_size > torch.cuda.device_count():
                 continue
             multi_process_parallel(world_size, self, self.eager_allreduce)
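Note: the refactor above lifts the sweep parameters out of setUpClass into class-level constants (TEST_SIZES, WORLD_SIZES, TEST_LOOP), leaving the fixture with only the deterministic seed. One side benefit, sketched below under the assumption that the module is importable as test_custom_allreduce (the name the workflow's unittest invocation uses), is that the sweep is visible without running any fixture:

    # Hypothetical inspection snippet; the constants are plain class
    # attributes, so no setUpClass (and no GPU) is needed to read them.
    from test_custom_allreduce import TestCustomAllReduce

    for sz in TestCustomAllReduce.TEST_SIZES:
        print(f"size={sz}, loops={TestCustomAllReduce.TEST_LOOP}")
    print("world sizes:", TestCustomAllReduce.WORLD_SIZES)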
@@ -102,9 +110,9 @@ class TestCustomAllReduce(CustomTestCase):
         torch.cuda.synchronize()
         del data
 
-        for sz in self.test_sizes:
+        for sz in self.TEST_SIZES:
             for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-                for _ in range(self.test_loop):
+                for _ in range(self.TEST_LOOP):
                     with graph_capture() as graph_capture_context:
                         # use integers so result matches NCCL exactly
                         inp1 = torch.randint(
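Note: the "use integers so result matches NCCL exactly" comment is what lets these tests assert exact equality: small integer values stored in float tensors sum exactly in any order, so reduction-order differences between the custom kernel and a reference cannot change the bits. A self-contained CPU illustration (simulating ranks with identical tensors; not from the diff):

    import torch

    world_size = 8
    # Values in [1, 16) stored as fp16 are exact, and every partial sum stays
    # a small integer (at most 15 * 8 = 120), well inside fp16's exact range.
    inp = torch.randint(1, 16, (4096,), dtype=torch.float16)
    summed = inp.clone()
    for _ in range(world_size - 1):  # add the other ranks' (identical) tensors
        summed += inp
    assert torch.equal(summed, inp * world_size)  # bit-exact, no tolerance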
@@ -151,9 +159,9 @@ class TestCustomAllReduce(CustomTestCase):
         initialize_model_parallel(tensor_model_parallel_size=world_size)
         group = get_tensor_model_parallel_group().device_group
 
-        for sz in self.test_sizes:
+        for sz in self.TEST_SIZES:
             for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-                for _ in range(self.test_loop):
+                for _ in range(self.TEST_LOOP):
                     inp1 = torch.randint(
                         1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
                     )
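Note: the hunk headers reference multi_process_parallel, the helper defined earlier in test_custom_allreduce.py that both test methods call as multi_process_parallel(world_size, self, bound_method). Its body is outside this diff; a hypothetical sketch of the pattern those call sites imply, with the worker argument list being an assumption:

    import torch.multiprocessing as mp

    def multi_process_parallel(world_size, test_case, target_fn):
        # One process per rank; target_fn is a bound test method such as
        # self.graph_allreduce, assumed here to accept (world_size, rank).
        ctx = mp.get_context("spawn")
        procs = [ctx.Process(target=target_fn, args=(world_size, rank))
                 for rank in range(world_size)]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
            test_case.assertEqual(p.exitcode, 0)  # any failed rank fails the test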