fix: enable multi-GPU Triton fused MoE tuning (#6295)

2025-08-19 20:16:58 +03:00
parent 94959237bf
commit a3b810ebdb
1 changed file with 42 additions and 36 deletions
--- a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
+++ b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
@@ -3,6 +3,7 @@ import argparse
 import json
 import time
 from datetime import datetime
 from contextlib import nullcontext
 from typing import Any, Dict, List, Tuple, TypedDict
 import ray
@@ -21,7 +22,7 @@ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
 )
 from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
 from sglang.srt.layers.moe.topk import TopKConfig, select_experts
-from sglang.srt.utils import is_hip
+from sglang.srt.utils import is_hip, is_rocm
 _is_hip = is_hip()
@@ -245,6 +246,9 @@ class BenchmarkWorker:
        torch.set_default_device("cuda")
        torch.cuda.manual_seed_all(0)
        self.seed = seed
        # Record the first GPU ID that Ray reports for this worker so that
        # tensors and kernels can be placed on the corresponding GPU.
        self.device_id = int(ray.get_gpu_ids()[0])
    def benchmark(
        self,
@@ -283,6 +287,7 @@ class BenchmarkWorker:
            )
        else:
            config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]
        with torch.cuda.device(self.device_id) if is_rocm() else nullcontext():
            kernel_time = benchmark_config(
                config,
                num_tokens,
@@ -314,6 +319,7 @@ class BenchmarkWorker:
    ) -> Dict[str, int]:
        best_config = None
        best_time = float("inf")
        with torch.cuda.device(self.device_id) if is_rocm() else nullcontext():
            for config in tqdm(search_space):
                try:
                    kernel_time = benchmark_config(