[1/n] chore: decouple quantization implementation from vLLM dependency (#7992)

This commit is contained in:
Peng Zhang
2025-07-17 06:56:26 +08:00
committed by GitHub
parent 570d33437b
commit c28ad1990d
13 changed files with 1498 additions and 636 deletions

View File

@@ -51,13 +51,12 @@ def check_quant_method(model_path: str, use_marlin_kernel: bool):
 model_config=model_config, load_config=load_config, device_config=device_config
 )
-from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
-from vllm.model_executor.layers.quantization.gptq_marlin import (
+from sglang.srt.layers.linear import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.gptq import (
+GPTQLinearMethod,
 GPTQMarlinLinearMethod,
 )
-from sglang.srt.layers.linear import UnquantizedLinearMethod
 linear_method_cls = (
 GPTQMarlinLinearMethod if use_marlin_kernel else (GPTQLinearMethod)
 )
@@ -162,7 +161,7 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
 cls.model,
 cls.base_url,
 timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-other_args=["--dtype", "float16"],
+other_args=["--dtype", "bfloat16"],
 )
 @classmethod