Fix linear.py and improve weight loading (#2851)

Co-authored-by: SangBin Cho <rkooo567@gmail.com>
2025-01-13 01:39:14 -08:00
parent 4093aa4660
commit 72c7776355
12 changed files with 113 additions and 125 deletions
--- a/python/sglang/srt/layers/quantization/fp8_utils.py
+++ b/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -1,8 +1,8 @@
 from typing import List, Optional, Tuple

 import torch
-from vllm.model_executor.parameter import RowvLLMParameter, _ColumnvLLMParameter

+from sglang.srt.layers.parameter import RowvLLMParameter, _ColumnvLLMParameter
 from sglang.srt.layers.quantization.fp8_kernel import (
    per_token_group_quant_fp8,
    w8a8_block_fp8_matmul,
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -11,9 +11,9 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    cutlass_fp8_supported,
    requantize_with_max_scale,
 )
-from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter

 from sglang.srt.layers.linear import LinearMethodBase
+from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
    QuantizationConfig,
    QuantizeMethodBase,