From 4b9b337b396eb4d76f45d6e95ef4588d4cd84402 Mon Sep 17 00:00:00 2001
From: maxiao1 <maxiao1@sugon.com>
Date: Wed, 29 Oct 2025 09:06:22 +0800
Subject: [PATCH] =?UTF-8?q?=E9=80=82=E9=85=8Dw8a8=E6=A8=A1=E5=9E=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

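---
Reviewer notes (this section is ignored by `git am` and is not part of the
commit message):

* The encoded subject decodes to "Adapt w8a8 models".
* model_config.py: adds "w8a8_int8" to the list that precedes
  optimized_quantization_methods in ModelConfig.
* fused_moe_triton_kernels.py and w8a8_int8.py: per_token_quant_int8 is now
  imported from lmslim.layers.gemm.int8_utils instead of
  sglang.srt.layers.quantization.int8_kernel.
* w8a8_int8.py: the int8_scaled_mm call in W8A8Int8LinearMethod is replaced
  with quant_ops.triton_scaled_mm (quant_ops is imported from lmslim).
* model_runner.py: UNBALANCED_MODEL_LOADING_TIMEOUT_S is raised from 300 to
  36000.
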
 python/sglang/srt/configs/model_config.py                  | 1 +
 .../moe/fused_moe_triton/fused_moe_triton_kernels.py       | 3 ++-
 python/sglang/srt/layers/quantization/w8a8_int8.py         | 7 +++++--
 python/sglang/srt/model_executor/model_runner.py           | 2 +-
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 3985d0350..36ec72568 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -615,6 +615,7 @@ class ModelConfig:
             "quark",
             "mxfp4",
             "slimquant_w4a8_marlin",
+            "w8a8_int8",
         ]
         optimized_quantization_methods = [
             "fp8",
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py
index 6a7229a9b..9a62ac22b 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py
@@ -14,9 +14,10 @@ from sglang.srt.layers.quantization.fp8_kernel import (
 )
 from sglang.srt.layers.quantization.int8_kernel import (
     per_token_group_quant_int8,
-    per_token_quant_int8,
+    # per_token_quant_int8 is imported from lmslim.layers.gemm.int8_utils below.
     sglang_per_token_group_quant_int8,
 )
+from lmslim.layers.gemm.int8_utils import per_token_quant_int8
 from sglang.srt.utils import (
     cpu_has_amx_support,
     get_bool_env_var,
diff --git a/python/sglang/srt/layers/quantization/w8a8_int8.py b/python/sglang/srt/layers/quantization/w8a8_int8.py
index 5ceba2f67..af797540d 100644
--- a/python/sglang/srt/layers/quantization/w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/w8a8_int8.py
@@ -22,7 +22,8 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizeMethodBase,
 )
 from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
-from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+# per_token_quant_int8 is imported from lmslim instead of sglang's int8_kernel.
+from lmslim.layers.gemm.int8_utils import per_token_quant_int8
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.utils import (
     apply_module_patch,
@@ -39,6 +40,8 @@ if TYPE_CHECKING:
         CombineInput,
         StandardDispatchOutput,
     )
+from lmslim import quant_ops
+
 
 _is_cuda = is_cuda()
 _is_cpu_amx_available = cpu_has_amx_support()
@@ -405,7 +408,7 @@ class W8A8Int8LinearMethod(LinearMethodBase):
         x_scale_2d = x_scale.view(-1, x_scale.shape[-1])
         output_shape = [*x_q.shape[:-1], layer.weight.shape[1]]
 
-        output = int8_scaled_mm(
+        output = quant_ops.triton_scaled_mm(
             x_q_2d,
             layer.weight,
             x_scale_2d,
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 2f63b611d..601d36387 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -203,7 +203,7 @@ _is_xpu_xmx_available = xpu_has_xmx_support()
 SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
 
 # Detect stragger ranks in model loading
-UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300
+UNBALANCED_MODEL_LOADING_TIMEOUT_S = 36000
 
 # the ratio of mamba cache pool size to max_running_requests, it will be safe when it is larger than 2 (yizhang2077)
 MAMBA_CACHE_SIZE_MAX_RUNNING_REQUESTS_RATIO = 3