Clean up import vllm in quantization/__init__.py (#4834)

2025-03-28 10:34:10 -07:00
parent ef9a378a20
commit 74e0ac1dbd
14 changed files with 191 additions and 254 deletions
--- a/test/srt/test_eagle_infer.py
+++ b/test/srt/test_eagle_infer.py
@@ -45,7 +45,7 @@ class TestEAGLEEngine(CustomTestCase):
        "mem_fraction_static": 0.7,
        "cuda_graph_max_bs": 4,
    }
-    NUM_CONFIGS = 3
+    NUM_CONFIGS = 2

    def setUp(self):
        self.prompt = "Today is a sunny day and I like"
@@ -61,8 +61,6 @@ class TestEAGLEEngine(CustomTestCase):
        configs = [
            # Basic config
            self.BASE_CONFIG,
-            # Disable cuda graph
-            {**self.BASE_CONFIG, "disable_cuda_graph": True},
            # Chunked prefill
            {**self.BASE_CONFIG, "chunked_prefill_size": 4},
        ]
--- a/test/srt/test_triton_attention_backend.py
+++ b/test/srt/test_triton_attention_backend.py
@@ -28,7 +28,7 @@ class TestTritonAttnBackend(CustomTestCase):
                "triton",
                "--enable-torch-compile",
                "--cuda-graph-max-bs",
-                16,
+                4,
            ],
        )