From 4b9b337b396eb4d76f45d6e95ef4588d4cd84402 Mon Sep 17 00:00:00 2001 From: maxiao1 Date: Wed, 29 Oct 2025 09:06:22 +0800 Subject: [PATCH] =?UTF-8?q?=E9=80=82=E9=85=8Dw8a8=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/sglang/srt/configs/model_config.py | 1 + .../moe/fused_moe_triton/fused_moe_triton_kernels.py | 3 ++- python/sglang/srt/layers/quantization/w8a8_int8.py | 7 +++++-- python/sglang/srt/model_executor/model_runner.py | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index 3985d0350..36ec72568 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -615,6 +615,7 @@ class ModelConfig: "quark", "mxfp4", "slimquant_w4a8_marlin", + "w8a8_int8", ] optimized_quantization_methods = [ "fp8", diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py index 6a7229a9b..9a62ac22b 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py @@ -14,9 +14,10 @@ from sglang.srt.layers.quantization.fp8_kernel import ( ) from sglang.srt.layers.quantization.int8_kernel import ( per_token_group_quant_int8, - per_token_quant_int8, + # per_token_quant_int8, sglang_per_token_group_quant_int8, ) +from lmslim.layers.gemm.int8_utils import per_token_quant_int8 from sglang.srt.utils import ( cpu_has_amx_support, get_bool_env_var, diff --git a/python/sglang/srt/layers/quantization/w8a8_int8.py b/python/sglang/srt/layers/quantization/w8a8_int8.py index 5ceba2f67..af797540d 100644 --- a/python/sglang/srt/layers/quantization/w8a8_int8.py +++ b/python/sglang/srt/layers/quantization/w8a8_int8.py @@ -22,7 +22,8 @@ from sglang.srt.layers.quantization.base_config import ( QuantizeMethodBase, ) from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer -from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8 +# from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8 +from lmslim.layers.gemm.int8_utils import per_token_quant_int8 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod from sglang.srt.utils import ( apply_module_patch, @@ -39,6 +40,8 @@ if TYPE_CHECKING: CombineInput, StandardDispatchOutput, ) +from lmslim import quant_ops + _is_cuda = is_cuda() _is_cpu_amx_available = cpu_has_amx_support() @@ -405,7 +408,7 @@ class W8A8Int8LinearMethod(LinearMethodBase): x_scale_2d = x_scale.view(-1, x_scale.shape[-1]) output_shape = [*x_q.shape[:-1], layer.weight.shape[1]] - output = int8_scaled_mm( + output = quant_ops.triton_scaled_mm( x_q_2d, layer.weight, x_scale_2d, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 2f63b611d..601d36387 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -203,7 +203,7 @@ _is_xpu_xmx_available = xpu_has_xmx_support() SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None) # Detect stragger ranks in model loading -UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300 +UNBALANCED_MODEL_LOADING_TIMEOUT_S = 36000 # the ratio of mamba cache pool size to max_running_requests, it will be safe when it is larger than 2 (yizhang2077) MAMBA_CACHE_SIZE_MAX_RUNNING_REQUESTS_RATIO = 3