From d364b9b0f26de4a10d77174f0542858eb5db6472 Mon Sep 17 00:00:00 2001 From: HAI Date: Mon, 28 Apr 2025 11:01:20 -0700 Subject: [PATCH] ROCm: update AITER (#5816) --- .github/workflows/pr-test-amd.yml | 12 +++--- 3rdparty/amd/tuning/benchmark_moe_rocm.py | 2 +- docker/Dockerfile.rocm | 4 +- .../layers/moe/fused_moe_triton/fused_moe.py | 4 +- .../srt/layers/moe/fused_moe_triton/layer.py | 32 +++++++------- python/sglang/srt/layers/quantization/fp8.py | 42 +++++++++---------- .../srt/layers/quantization/fp8_utils.py | 4 +- 7 files changed, 48 insertions(+), 52 deletions(-) diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index e8071119e..0a0124c60 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -38,12 +38,12 @@ jobs: else DEVICE_FLAG="--device /dev/dri" fi - docker pull lmsysorg/sglang:v0.4.5.post3-rocm630 + docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428 docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \ -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.5.post3-rocm630 + ghcr.io/saienduri/sglang-aiter-v0.1.1:428 - name: Install dependencies run: | @@ -82,12 +82,12 @@ jobs: else DEVICE_FLAG="--device /dev/dri" fi - docker pull lmsysorg/sglang:v0.4.5.post3-rocm630 + docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428 docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ --cap-add=SYS_PTRACE -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} --security-opt seccomp=unconfined \ -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.5.post3-rocm630 + ghcr.io/saienduri/sglang-aiter-v0.1.1:428 - name: Install dependencies run: | @@ -120,12 +120,12 @@ jobs: else DEVICE_FLAG="--device /dev/dri" fi - docker pull lmsysorg/sglang:v0.4.5.post3-rocm630 + docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428 docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \ -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \ --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \ -w /sglang-checkout --name ci_sglang \ - lmsysorg/sglang:v0.4.5.post3-rocm630 + ghcr.io/saienduri/sglang-aiter-v0.1.1:428 - name: Install dependencies run: | diff --git a/3rdparty/amd/tuning/benchmark_moe_rocm.py b/3rdparty/amd/tuning/benchmark_moe_rocm.py index 5aff8c0d6..af596d218 100644 --- a/3rdparty/amd/tuning/benchmark_moe_rocm.py +++ b/3rdparty/amd/tuning/benchmark_moe_rocm.py @@ -15,7 +15,7 @@ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import ( get_config_file_name, ) -padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0 +padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0 def main(model, tp_size, dtype: str, batches): diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index d8a2bd1c5..d09d3be13 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -18,7 +18,7 @@ ARG TRITON_COMMIT="improve_fa_decode_3.0.0" ARG AITER_REPO="https://github.com/ROCm/aiter.git" -ARG AITER_COMMIT="testx" +ARG AITER_COMMIT="v0.1.1" RUN git clone ${SGL_REPO} \ && cd sglang \ @@ -74,7 +74,7 @@ ENV SGLANG_SET_CPU_AFFINITY=1 ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 ENV NCCL_MIN_NCHANNELS=112 -ENV MOE_PADDING=1 +ENV SGLANG_MOE_PADDING=1 ENV VLLM_FP8_PADDING=1 ENV VLLM_FP8_ACT_PADDING=1 ENV VLLM_FP8_WEIGHT_PADDING=1 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py index da541f542..823bcb2fc 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py @@ -45,7 +45,7 @@ if _is_cuda or _is_hip: logger = logging.getLogger(__name__) -padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0 +padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0 enable_moe_align_block_size_triton = bool( int(os.getenv("ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0")) ) @@ -1327,7 +1327,7 @@ def fused_experts_impl( if ( not (use_fp8_w8a8 or use_int8_w8a8) or block_shape is not None - or (_is_hip and get_bool_env_var("CK_MOE")) + or (_is_hip and get_bool_env_var("SGLANG_AITER_MOE")) ): padded_size = 0 diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index 6ce240cb3..12194ba18 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -18,7 +18,7 @@ from sglang.srt.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, ) -from sglang.srt.utils import get_bool_env_var, is_hip, permute_weight, set_weight_attrs +from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs if torch.cuda.is_available(): from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts @@ -30,7 +30,9 @@ import logging _is_hip = is_hip() if _is_hip: - from aiter import ck_moe + from aiter import ActivationType + from aiter.fused_moe_bf16_asm import ck_moe_2stages + from aiter.ops.shuffle import shuffle_weight logger = logging.getLogger(__name__) @@ -102,14 +104,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): set_weight_attrs(w2_weight, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - if _is_hip and get_bool_env_var("CK_MOE"): + if _is_hip and get_bool_env_var("SGLANG_AITER_MOE"): layer.w13_weight = torch.nn.Parameter( - permute_weight(layer.w13_weight.data), + shuffle_weight(layer.w13_weight.data, (16, 16)), requires_grad=False, ) torch.cuda.empty_cache() layer.w2_weight = torch.nn.Parameter( - permute_weight(layer.w2_weight.data), + shuffle_weight(layer.w2_weight.data, (16, 16)), requires_grad=False, ) torch.cuda.empty_cache() @@ -182,21 +184,17 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): routed_scaling_factor=routed_scaling_factor, ) - if _is_hip and get_bool_env_var("CK_MOE"): + if _is_hip and get_bool_env_var("SGLANG_AITER_MOE"): assert not no_combine, "unsupported" - return ck_moe( + return ck_moe_2stages( x, layer.w13_weight, layer.w2_weight, topk_weights, topk_ids, - None, - None, - None, - None, - 32, - None, - activation, + activation=( + ActivationType.Silu if activation == "silu" else ActivationType.Gelu + ), ) else: return fused_experts( @@ -527,7 +525,7 @@ class FusedMoE(torch.nn.Module): # Case input scale: input_scale loading is only supported for fp8 if "input_scale" in weight_name: # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust input_scale for e4m3fnuz (AMD) - if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"): + if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"): loaded_weight = loaded_weight * 2.0 # this is needed for compressed-tensors only @@ -569,7 +567,7 @@ class FusedMoE(torch.nn.Module): quant_method = getattr(param, "quant_method", None) if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value: # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust INT4 column-wise scaling number to e4m3fnuz (AMD) - if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"): + if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"): loaded_weight = loaded_weight * 0.5 self._load_per_channel_weight_scale( @@ -592,7 +590,7 @@ class FusedMoE(torch.nn.Module): ) elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust FP8 per-tensor scaling number for e4m3fnuz (AMD) - if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"): + if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"): loaded_weight = loaded_weight * 2.0 self._load_per_tensor_weight_scale( diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index b37a86887..2a5de2f21 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -72,8 +72,8 @@ _is_hip = is_hip() _is_cuda = is_cuda() if _is_hip: - from aiter import ActivationType - from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages, ck_moe_2stages_win4 + from aiter import ActivationType, QuantType + from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages from aiter.ops.shuffle import shuffle_weight if not _is_cuda: @@ -484,7 +484,7 @@ class Fp8MoEMethod: if self.quant_config.is_checkpoint_fp8_serialized: params_dtype = ( torch.uint32 - if get_bool_env_var("USE_INT4_WEIGHT") + if get_bool_env_var("SGLANG_INT4_WEIGHT") else torch.float8_e4m3fn ) tp_size = get_tensor_model_parallel_world_size() @@ -511,7 +511,7 @@ class Fp8MoEMethod: ) # WEIGHTS - if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"): + if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"): # INT4 MoE weight - INT32 packed w13_weight = torch.nn.Parameter( torch.empty( @@ -585,7 +585,7 @@ class Fp8MoEMethod: if ( _is_hip - ): # and get_bool_env_var("CK_MOE"): TODO: add check back after triton kernel + ): # and get_bool_env_var("SGLANG_AITER_MOE"): TODO: add check back after triton kernel # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling w13_weight_scale1 = torch.nn.Parameter( torch.ones(num_experts, 2 * intermediate_size, dtype=torch.float32), @@ -612,7 +612,7 @@ class Fp8MoEMethod: set_weight_attrs(w13_weight_scale, extra_weight_attrs) set_weight_attrs(w2_weight_scale, extra_weight_attrs) - if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"): + if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"): extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} ) @@ -644,7 +644,7 @@ class Fp8MoEMethod: layer.w2_input_scale = None def process_weights_after_loading(self, layer: Module) -> None: - if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"): + if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"): self.process_weights_hip_int4(layer) return @@ -675,7 +675,7 @@ class Fp8MoEMethod: ) layer.w2_input_scale = None - if get_bool_env_var("CK_MOE"): + if get_bool_env_var("SGLANG_AITER_MOE"): # Pre-shuffle weights layer.w13_weight.data = shuffle_weight( layer.w13_weight.contiguous(), (16, 16) @@ -798,17 +798,15 @@ class Fp8MoEMethod: return def process_weights_hip_int4(self, layer: Module): - # TODO: and get_bool_env_var("CK_MOE"): add after triton kernel added + # TODO: and get_bool_env_var("SGLANG_AITER_MOE"): add after triton kernel added # INT4-FP8 (INT4 MoE Weight, FP8 Compute) # Weight Permutation layer.w13_weight = torch.nn.Parameter( - # permute_weight(layer.w13_weight.data), shuffle_weight(layer.w13_weight.data, (16, 16)), requires_grad=False, ) torch.cuda.empty_cache() layer.w2_weight = torch.nn.Parameter( - # permute_weight(layer.w2_weight.data), shuffle_weight(layer.w2_weight.data, (16, 16)), requires_grad=False, ) @@ -847,23 +845,21 @@ class Fp8MoEMethod: padding_size, # Avoid circular import ) - if get_bool_env_var("CK_MOE"): + if get_bool_env_var("SGLANG_AITER_MOE"): layer.w13_weight = torch.nn.Parameter( - # permute_weight(layer.w13_weight.data), shuffle_weight(layer.w13_weight.data, (16, 16)), requires_grad=False, ) torch.cuda.empty_cache() layer.w2_weight = torch.nn.Parameter( - # permute_weight(layer.w2_weight.data), shuffle_weight(layer.w2_weight.data, (16, 16)), requires_grad=False, ) torch.cuda.empty_cache() - # ROCm (CK_MOE): using column-wise scaling + # ROCm (SGLANG_AITER_MOE): using column-wise scaling layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1) layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1) - elif get_bool_env_var("MOE_PADDING"): + elif get_bool_env_var("SGLANG_MOE_PADDING"): # If ROCm, apply weight padding (min. Mem channel contention) only if set layer.w13_weight = torch.nn.Parameter( F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0), @@ -912,15 +908,16 @@ class Fp8MoEMethod: ) if _is_hip: - if get_bool_env_var("USE_INT4_WEIGHT"): - # TODO: add triton kernel and add check get_bool_env_var("CK_MOE") + if get_bool_env_var("SGLANG_INT4_WEIGHT"): + # TODO: add triton kernel and add check get_bool_env_var("SGLANG_AITER_MOE") assert not no_combine, f"{no_combine=} is not supported." - return ck_moe_2stages_win4( + return ck_moe_2stages( x, layer.w13_weight, layer.w2_weight, topk_weights, topk_ids, + QuantType.per_Token, layer.w13_weight_scale1, layer.w2_weight_scale1, activation=( @@ -930,13 +927,13 @@ class Fp8MoEMethod: ), ) - if get_bool_env_var("CK_MOE"): + if get_bool_env_var("SGLANG_AITER_MOE"): assert not no_combine, f"{no_combine=} is not supported." if self.block_quant: - # TODO(CK_MOE): FP8 block_quant only supports 'silu' for the time-being. + # TODO(SGLANG_AITER_MOE): FP8 block_quant only supports 'silu' for the time-being. assert ( activation == "silu" - ), f"CK_MOE: FP8 bloack_quant {activation=} will be supported later, unset CK_MOE" + ), f"SGLANG_AITER_MOE: FP8 bloack_quant {activation=} will be supported later, unset SGLANG_AITER_MOE" return asm_moe( x, layer.w13_weight, @@ -955,6 +952,7 @@ class Fp8MoEMethod: layer.w2_weight, topk_weights, topk_ids, + QuantType.per_Token, layer.w13_weight_scale1, layer.w2_weight_scale1, activation=( diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py index 7948cff16..c137d3ad1 100644 --- a/python/sglang/srt/layers/quantization/fp8_utils.py +++ b/python/sglang/srt/layers/quantization/fp8_utils.py @@ -31,7 +31,7 @@ from sglang.srt.utils import ( _is_hip = is_hip() _is_cuda = is_cuda() -if _is_hip and get_bool_env_var("CK_MOE"): +if _is_hip and get_bool_env_var("SGLANG_AITER_MOE"): from aiter import gemm_a8w8_blockscale if _is_cuda: @@ -132,7 +132,7 @@ def apply_w8a8_block_fp8_linear( output = fp8_blockwise_scaled_mm( q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype ) - elif _is_hip and get_bool_env_var("CK_MOE"): + elif _is_hip and get_bool_env_var("SGLANG_AITER_MOE"): q_input, x_scale = per_token_group_quant_fp8( input_2d, block_size[1], column_major_scales=False )