[1/n] chore: decouple quantization implementation from vLLM dependency (#7992)

2025-07-17 06:56:26 +08:00
parent 570d33437b
commit c28ad1990d
13 changed files with 1498 additions and 636 deletions
--- a/test/srt/test_gptqmodel_dynamic.py
+++ b/test/srt/test_gptqmodel_dynamic.py
@@ -51,13 +51,12 @@ def check_quant_method(model_path: str, use_marlin_kernel: bool):
        model_config=model_config, load_config=load_config, device_config=device_config
    )

-    from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
-    from vllm.model_executor.layers.quantization.gptq_marlin import (
+    from sglang.srt.layers.linear import UnquantizedLinearMethod
+    from sglang.srt.layers.quantization.gptq import (
+        GPTQLinearMethod,
        GPTQMarlinLinearMethod,
    )

-    from sglang.srt.layers.linear import UnquantizedLinearMethod
-
    linear_method_cls = (
        GPTQMarlinLinearMethod if use_marlin_kernel else (GPTQLinearMethod)
    )
@@ -162,7 +161,7 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--dtype", "float16"],
+            other_args=["--dtype", "bfloat16"],
        )

    @classmethod