diff --git a/python/sglang/srt/layers/quantization/gptq.py b/python/sglang/srt/layers/quantization/gptq.py index 2e3e8c89c..710fe5a2e 100644 --- a/python/sglang/srt/layers/quantization/gptq.py +++ b/python/sglang/srt/layers/quantization/gptq.py @@ -6,7 +6,6 @@ import torch from sglang.srt.layers.linear import LinearBase from sglang.srt.layers.quantization.base_config import QuantizationConfig -from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead from sglang.srt.utils import is_cuda _is_cuda = is_cuda() @@ -434,6 +433,9 @@ class MarlinConfig(QuantizationConfig): from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod + # Delay import to avoid circular dependency + from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead + if isinstance(layer, LinearBase) or ( isinstance(layer, ParallelLMHead) and self.lm_head_quantized ): diff --git a/python/sglang/test/__init__.py b/python/sglang/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/sglang/test/attention/__init__.py b/python/sglang/test/attention/__init__.py new file mode 100644 index 000000000..e69de29bb