[1/n] chore: decouple quantization implementation from vLLM dependency (#7992)
This commit is contained in:
@@ -51,13 +51,12 @@ def check_quant_method(model_path: str, use_marlin_kernel: bool):
|
||||
model_config=model_config, load_config=load_config, device_config=device_config
|
||||
)
|
||||
|
||||
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
|
||||
from vllm.model_executor.layers.quantization.gptq_marlin import (
|
||||
from sglang.srt.layers.linear import UnquantizedLinearMethod
|
||||
from sglang.srt.layers.quantization.gptq import (
|
||||
GPTQLinearMethod,
|
||||
GPTQMarlinLinearMethod,
|
||||
)
|
||||
|
||||
from sglang.srt.layers.linear import UnquantizedLinearMethod
|
||||
|
||||
linear_method_cls = (
|
||||
GPTQMarlinLinearMethod if use_marlin_kernel else (GPTQLinearMethod)
|
||||
)
|
||||
@@ -162,7 +161,7 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
|
||||
cls.model,
|
||||
cls.base_url,
|
||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
other_args=["--dtype", "float16"],
|
||||
other_args=["--dtype", "bfloat16"],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
|
||||
Reference in New Issue
Block a user