diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index c67e25813..67594a7f7 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -4,19 +4,15 @@ on:
   push:
     branches: [ main ]
     paths:
-      - "python/pyproject.toml"
-      - "python/sglang/**"
-      - "test/**"
-      - "docs/**"
+      - "python/**"
       - "scripts/**"
+      - "test/**"
   pull_request:
     branches: [ main ]
     paths:
-      - "python/pyproject.toml"
-      - "python/sglang/**"
-      - "test/**"
-      - "docs/**"
+      - "python/**"
       - "scripts/**"
+      - "test/**"
   workflow_dispatch:
     inputs:
       version:
diff --git a/.github/workflows/vllm-dependency-test.yml b/.github/workflows/vllm-dependency-test.yml
index 96a2b362e..27e2176ef 100644
--- a/.github/workflows/vllm-dependency-test.yml
+++ b/.github/workflows/vllm-dependency-test.yml
@@ -4,19 +4,15 @@ on:
   push:
     branches: [ main ]
     paths:
-      - "python/pyproject.toml"
-      - "python/sglang/**"
-      - "test/**"
-      - "docs/**"
+      - "python/**"
       - "scripts/**"
+      - "test/**"
   pull_request:
     branches: [ main ]
     paths:
-      - "python/pyproject.toml"
-      - "python/sglang/**"
-      - "test/**"
-      - "docs/**"
+      - "python/**"
       - "scripts/**"
+      - "test/**"
 
 concurrency:
   group: vllm-dependency-test-${{ github.ref }}
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 8e053efed..621fa65ce 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -17,6 +17,7 @@ dependencies = ["aiohttp", "requests", "tqdm", "numpy", "IPython", "setproctitle
 
 [project.optional-dependencies]
 runtime_common = [
+    "compressed-tensors",
     "datasets",
     "decord",
     "fastapi",
@@ -56,7 +57,12 @@ srt = [
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20250114, not from public vllm whl
-srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
+srt_hip = [
+    "sglang[runtime_common]",
+    "torch",
+    "vllm==0.6.7.dev2",
+    "outlines==0.1.11"
+]
 
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 91383238a..8bc8d12b0 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -22,11 +22,7 @@ import torch
 from transformers import PretrainedConfig
 
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
-from sglang.srt.layers.quantization import (
-    BASE_QUANTIZATION_METHODS,
-    QUANTIZATION_METHODS,
-    VLLM_AVAILABLE,
-)
+from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.utils import get_bool_env_var, is_hip
 
 logger = logging.getLogger(__name__)
@@ -239,12 +235,7 @@ class ModelConfig:
 
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
     def _verify_quantization(self) -> None:
-        # Select supported quantization methods based on vllm availability
-        if VLLM_AVAILABLE:
-            supported_quantization = [*QUANTIZATION_METHODS]
-        else:
-            supported_quantization = [*BASE_QUANTIZATION_METHODS]
-
+        supported_quantization = [*QUANTIZATION_METHODS]
         rocm_supported_quantization = [
             "awq",
             "gptq",
@@ -282,11 +273,7 @@ class ModelConfig:
             quant_method = quant_cfg.get("quant_method", "").lower()
 
             # Detect which checkpoint it is
-            # Only iterate through currently available quantization methods
-            available_methods = (
-                QUANTIZATION_METHODS if VLLM_AVAILABLE else BASE_QUANTIZATION_METHODS
-            )
-            for _, method in available_methods.items():
+            for _, method in 
QUANTIZATION_METHODS.items(): quantization_override = method.override_quantization_method( quant_cfg, self.quantization ) diff --git a/python/sglang/srt/layers/moe/topk.py b/python/sglang/srt/layers/moe/topk.py index c04e06d2e..cfa264a23 100644 --- a/python/sglang/srt/layers/moe/topk.py +++ b/python/sglang/srt/layers/moe/topk.py @@ -17,12 +17,12 @@ from typing import Callable, Optional import torch import torch.nn.functional as F +from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder from sglang.srt.utils import get_compiler_backend, is_cuda, is_hip _is_cuda = is_cuda() _is_hip = is_hip() -from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder expert_distribution_recorder = ExpertDistributionRecorder() diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index 407437aee..9667ddf4d 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -9,12 +9,24 @@ import torch try: from vllm.model_executor.layers.quantization.aqlm import AQLMConfig - from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig + from vllm.model_executor.layers.quantization.awq_marlin import ( + AWQMarlinConfig, + AWQMoEMethod, + ) from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig + from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( + CompressedTensorsW8A8Fp8MoEMethod, + CompressedTensorsWNA16MoEMethod, + ) from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig from vllm.model_executor.layers.quantization.experts_int8 import ExpertsInt8Config from vllm.model_executor.layers.quantization.fbgemm_fp8 import FBGEMMFp8Config from vllm.model_executor.layers.quantization.gguf import GGUFConfig + from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod + from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinLinearMethod, + GPTQMarlinMoEMethod, + ) from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQMarlin24Config, ) @@ -22,24 +34,24 @@ try: from vllm.model_executor.layers.quantization.qqq import QQQConfig from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig - from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig - VLLM_AVAILABLE = True except ImportError: VLLM_AVAILABLE = False # Define empty classes as placeholders when vllm is not available class DummyConfig: - pass + def override_quantization_method(self, *args, **kwargs): + return None AQLMConfig = AWQMarlinConfig = BitsAndBytesConfig = CompressedTensorsConfig = ( - DummyConfig - ) - DeepSpeedFPConfig = ExpertsInt8Config = FBGEMMFp8Config = GGUFConfig = ( - GPTQMarlin24Config - ) = DummyConfig - MarlinConfig = QQQConfig = Int8TpuConfig = DummyConfig + DeepSpeedFPConfig + ) = ExpertsInt8Config = FBGEMMFp8Config = GGUFConfig = GPTQMarlin24Config = ( + MarlinConfig + ) = QQQConfig = Int8TpuConfig = DummyConfig + +from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod +from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE from sglang.srt.layers.quantization.awq import AWQConfig from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8Config @@ -47,9 +59,14 @@ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import CompressedTensorsConfig, ) 
from sglang.srt.layers.quantization.fp8 import Fp8Config +from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config +from sglang.srt.layers.vocab_parallel_embedding import ( + ParallelLMHead, + UnquantizedEmbeddingMethod, +) # Base quantization methods that don't depend on vllm BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = { @@ -61,26 +78,25 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = { "compressed-tensors": CompressedTensorsConfig, } -# Add vllm-dependent methods if available -QUANTIZATION_METHODS = BASE_QUANTIZATION_METHODS.copy() -if VLLM_AVAILABLE: - VLLM_QUANTIZATION_METHODS = { - "aqlm": AQLMConfig, - "awq": AWQConfig, - "deepspeedfp": DeepSpeedFPConfig, - "tpu_int8": Int8TpuConfig, - "fbgemm_fp8": FBGEMMFp8Config, - "marlin": MarlinConfig, - "gguf": GGUFConfig, - "gptq_marlin_24": GPTQMarlin24Config, - "awq_marlin": AWQMarlinConfig, - "bitsandbytes": BitsAndBytesConfig, - "qqq": QQQConfig, - "experts_int8": ExpertsInt8Config, - "gptq_marlin": GPTQMarlinConfig, - "gptq": GPTQConfig, - } - QUANTIZATION_METHODS.update(VLLM_QUANTIZATION_METHODS) +# VLLM-dependent quantization methods +VLLM_QUANTIZATION_METHODS = { + "aqlm": AQLMConfig, + "awq": AWQConfig, + "deepspeedfp": DeepSpeedFPConfig, + "tpu_int8": Int8TpuConfig, + "fbgemm_fp8": FBGEMMFp8Config, + "marlin": MarlinConfig, + "gguf": GGUFConfig, + "gptq_marlin_24": GPTQMarlin24Config, + "awq_marlin": AWQMarlinConfig, + "bitsandbytes": BitsAndBytesConfig, + "qqq": QQQConfig, + "experts_int8": ExpertsInt8Config, + "gptq_marlin": GPTQMarlinConfig, + "gptq": GPTQConfig, +} + +QUANTIZATION_METHODS = {**BASE_QUANTIZATION_METHODS, **VLLM_QUANTIZATION_METHODS} def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: @@ -89,6 +105,12 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: f"Invalid quantization method: {quantization}. " f"Available methods: {list(QUANTIZATION_METHODS.keys())}" ) + if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE: + raise ValueError( + f"{quantization} quantization requires some operators from vllm. 
" + "Pleaes install vllm by `pip install vllm==0.7.2`" + ) + return QUANTIZATION_METHODS[quantization] @@ -153,13 +175,6 @@ def get_linear_quant_method( prefix: str, linear_method_cls: type, ): - - from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod - from sglang.srt.layers.vocab_parallel_embedding import ( - ParallelLMHead, - UnquantizedEmbeddingMethod, - ) - cloned_config = deepcopy(config) parallel_lm_head_quantized = ( isinstance(layer, ParallelLMHead) and cloned_config.lm_head_quantized @@ -186,31 +201,17 @@ def get_linear_quant_method( def gptq_get_quant_method(self, layer, prefix): - if not VLLM_AVAILABLE: - return None + if isinstance(layer, FusedMoE): + return GPTQMarlinMoEMethod(self) - try: - from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod - from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQMarlinLinearMethod, - GPTQMarlinMoEMethod, + if isinstance(self, GPTQConfig): + return get_linear_quant_method( + self, layer, prefix=prefix, linear_method_cls=GPTQLinearMethod + ) + elif isinstance(self, GPTQMarlinConfig): + return get_linear_quant_method( + self, layer, prefix=prefix, linear_method_cls=GPTQMarlinLinearMethod ) - - from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE - - if isinstance(layer, FusedMoE): - return GPTQMarlinMoEMethod(self) - - if isinstance(self, GPTQConfig): - return get_linear_quant_method( - self, layer, prefix=prefix, linear_method_cls=GPTQLinearMethod - ) - elif isinstance(self, GPTQMarlinConfig): - return get_linear_quant_method( - self, layer, prefix=prefix, linear_method_cls=GPTQMarlinLinearMethod - ) - except ImportError: - pass return None @@ -229,33 +230,28 @@ def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False): builtins.isinstance = original_isinstance return - try: - from vllm.model_executor.layers.fused_moe import FusedMoE - from vllm.model_executor.layers.linear import LinearBase - from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, - ) + from vllm.model_executor.layers.fused_moe import FusedMoE + from vllm.model_executor.layers.linear import LinearBase + from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, + ) - from sglang.srt.layers.linear import LinearBase as PatchedLinearBase - from sglang.srt.layers.moe.fused_moe_triton.layer import ( - FusedMoE as PatchedFusedMoE, - ) - from sglang.srt.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding as PatchedVocabParallelEmbedding, - ) + from sglang.srt.layers.linear import LinearBase as PatchedLinearBase + from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE as PatchedFusedMoE + from sglang.srt.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding as PatchedVocabParallelEmbedding, + ) - def patched_isinstance(obj, classinfo): - if classinfo is LinearBase: - return original_isinstance(obj, PatchedLinearBase) - if classinfo is FusedMoE: - return original_isinstance(obj, PatchedFusedMoE) - if classinfo is VocabParallelEmbedding: - return original_isinstance(obj, PatchedVocabParallelEmbedding) - return original_isinstance(obj, classinfo) + def patched_isinstance(obj, classinfo): + if classinfo is LinearBase: + return original_isinstance(obj, PatchedLinearBase) + if classinfo is FusedMoE: + return original_isinstance(obj, PatchedFusedMoE) + if classinfo is VocabParallelEmbedding: + return original_isinstance(obj, PatchedVocabParallelEmbedding) + return original_isinstance(obj, classinfo) - 
builtins.isinstance = patched_isinstance - except ImportError: - return + builtins.isinstance = patched_isinstance def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"): @@ -263,91 +259,64 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"): Monkey patch the apply function of vllm's FusedMoEMethodBase. Convert sglang arguments to vllm arguments. """ - if not VLLM_AVAILABLE: - return + original_apply = class_obj.apply + sig = inspect.signature(original_apply) + param_names = list(sig.parameters.keys()) + has_correction_bias = "e_score_correction_bias" in param_names - try: - original_apply = class_obj.apply - sig = inspect.signature(original_apply) - param_names = list(sig.parameters.keys()) - has_correction_bias = "e_score_correction_bias" in param_names + def new_apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + correction_bias: Optional[torch.Tensor] = None, + activation: str = "silu", + inplace: bool = True, + no_combine: bool = False, + ): + assert activation == "silu" + assert inplace and not no_combine - def new_apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None, - custom_routing_function: Optional[Callable] = None, - correction_bias: Optional[torch.Tensor] = None, - activation: str = "silu", - inplace: bool = True, - no_combine: bool = False, - ): - assert activation == "silu" - assert inplace and not no_combine + kwargs = { + "self": self, + "layer": layer, + "x": x, + "router_logits": router_logits, + "top_k": top_k, + "renormalize": renormalize, + "use_grouped_topk": use_grouped_topk, + "topk_group": topk_group, + "num_expert_group": num_expert_group, + "custom_routing_function": custom_routing_function, + } + if correction_bias is not None: + if not has_correction_bias: + raise ValueError( + "Please increase the version of your vllm. Try `pip install vllm==0.7.2`" + ) + kwargs["e_score_correction_bias"] = correction_bias + return original_apply(**kwargs) - kwargs = { - "self": self, - "layer": layer, - "x": x, - "router_logits": router_logits, - "top_k": top_k, - "renormalize": renormalize, - "use_grouped_topk": use_grouped_topk, - "topk_group": topk_group, - "num_expert_group": num_expert_group, - "custom_routing_function": custom_routing_function, - } - if correction_bias is not None: - if not has_correction_bias: - raise ValueError( - "Please increase the version of your vllm. 
Try `pip install vllm==0.7.2`" - ) - kwargs["e_score_correction_bias"] = correction_bias - return original_apply(**kwargs) - - setattr(class_obj, "apply", new_apply) - except (ImportError, AttributeError): - return + setattr(class_obj, "apply", new_apply) def monkey_patch_quant_configs(): """Apply all monkey patches in one place.""" - if not VLLM_AVAILABLE: - return + setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method) + setattr(GPTQConfig, "get_quant_method", gptq_get_quant_method) - try: - from vllm.model_executor.layers.quantization.awq_marlin import AWQMoEMethod - from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( - CompressedTensorsW8A8Fp8MoEMethod, - CompressedTensorsWNA16MoEMethod, - ) - from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQMarlinMoEMethod, - ) - - setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method) - setattr(GPTQConfig, "get_quant_method", gptq_get_quant_method) - - monkey_patch_moe_apply(AWQMoEMethod) - monkey_patch_moe_apply(GPTQMarlinMoEMethod) - monkey_patch_moe_apply(CompressedTensorsW8A8Fp8MoEMethod) - monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod) - except ImportError: - return + monkey_patch_moe_apply(AWQMoEMethod) + monkey_patch_moe_apply(GPTQMarlinMoEMethod) + monkey_patch_moe_apply(CompressedTensorsW8A8Fp8MoEMethod) + monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod) # Only apply monkey patches if vllm is available if VLLM_AVAILABLE: monkey_patch_quant_configs() - - -__all__ = [ - "get_quantization_config", - "QUANTIZATION_METHODS", -] diff --git a/python/sglang/srt/layers/quantization/awq.py b/python/sglang/srt/layers/quantization/awq.py index 7a8af13aa..b14807ee4 100644 --- a/python/sglang/srt/layers/quantization/awq.py +++ b/python/sglang/srt/layers/quantization/awq.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional import torch from sgl_kernel import awq_dequantize diff --git a/python/sglang/srt/layers/quantization/fp8_kernel.py b/python/sglang/srt/layers/quantization/fp8_kernel.py index 2043805e7..bfd74474e 100644 --- a/python/sglang/srt/layers/quantization/fp8_kernel.py +++ b/python/sglang/srt/layers/quantization/fp8_kernel.py @@ -24,6 +24,7 @@ import triton.language as tl from sglang.srt.utils import ( direct_register_custom_op, + get_bool_env_var, get_device_core_count, get_device_name, get_device_sm, @@ -43,7 +44,7 @@ if _is_cuda: from sgl_kernel import sgl_per_token_group_quant_fp8, sgl_per_token_quant_fp8 sm_version = get_device_sm() - if sm_version >= 90 and int(os.getenv("SGL_ENABLE_JIT_DEEPGEMM", "1")): + if sm_version >= 90 and get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true"): _enable_jit_deepgemm = True diff --git a/python/sglang/srt/layers/quantization/gptq.py b/python/sglang/srt/layers/quantization/gptq.py index 710fe5a2e..b52c05756 100644 --- a/python/sglang/srt/layers/quantization/gptq.py +++ b/python/sglang/srt/layers/quantization/gptq.py @@ -11,12 +11,29 @@ from sglang.srt.utils import is_cuda _is_cuda = is_cuda() try: - import vllm + from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase + from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod + from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinLinearMethod, + GPTQMarlinMoEMethod, + ) + from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod + from 
vllm.model_executor.layers.quantization.utils.marlin_utils import (
+        check_marlin_supported,
+    )
+    from vllm.scalar_type import scalar_types
 
     VLLM_AVAILABLE = True
 except ImportError:
     VLLM_AVAILABLE = False
 
+    GPTQLinearMethod = GPTQMarlinLinearMethod = GPTQMarlinMoEMethod = MarlinLinearMethod = QuantizeMethodBase = Any
+
+    class scalar_types:
+        uint4b8 = "uint4b8"
+        uint8b128 = "uint8b128"
+
+
 logger = logging.getLogger(__name__)
 
@@ -117,12 +134,8 @@ class GPTQConfig(QuantizationConfig):
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["GPTQLinearMethod"]:
-        if not VLLM_AVAILABLE:
-            raise ImportError("vllm is not installed")
-
-        from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
-
+    ) -> Optional[GPTQLinearMethod]:
+        # Delay the import to avoid circular dependency
         from sglang.srt.layers.quantization import get_linear_quant_method
 
         return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod)
@@ -131,16 +144,11 @@ class GPTQConfig(QuantizationConfig):
 class GPTQMarlinConfig(QuantizationConfig):
     """Config class for GPTQ Marlin"""
 
-    if VLLM_AVAILABLE:
-        from vllm.scalar_type import scalar_types
-
-        # (num_bits, is_sym) -> quant_type
-        TYPE_MAP = {
-            (4, True): scalar_types.uint4b8,
-            (8, True): scalar_types.uint8b128,
-        }
-    else:
-        raise ImportError("vllm is not installed")
+    # (num_bits, is_sym) -> quant_type
+    TYPE_MAP = {
+        (4, True): scalar_types.uint4b8,
+        (8, True): scalar_types.uint8b128,
+    }
 
     def __init__(
         self,
@@ -197,6 +205,7 @@ class GPTQMarlinConfig(QuantizationConfig):
                 "Unsupported quantization config: "
                 f"bits={weight_bits}, sym={is_sym}"
             )
+        # (num_bits, is_sym) -> quant_type
         self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)]
 
     def __repr__(self) -> str:
@@ -278,15 +287,8 @@ class GPTQMarlinConfig(QuantizationConfig):
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
-        if not VLLM_AVAILABLE:
-            raise ImportError("vllm is not installed")
-
-        from vllm.model_executor.layers.quantization.gptq_marlin import (
-            GPTQMarlinLinearMethod,
-            GPTQMarlinMoEMethod,
-        )
-
+    ) -> Optional[QuantizeMethodBase]:
+        # Delay the import to avoid circular dependency
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
         from sglang.srt.layers.quantization import get_linear_quant_method
 
@@ -304,19 +306,12 @@ class GPTQMarlinConfig(QuantizationConfig):
 
     @classmethod
     def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]):
-        if not VLLM_AVAILABLE:
-            return False
-
         quant_method = quant_config.get("quant_method", "").lower()
         num_bits = quant_config.get("bits")
         group_size = quant_config.get("group_size")
         sym = quant_config.get("sym")
         desc_act = quant_config.get("desc_act")
 
-        from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-            check_marlin_supported,
-        )
-
         if not _is_cuda:
             return False
 
@@ -427,13 +422,8 @@ class MarlinConfig(QuantizationConfig):
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["MarlinLinearMethod"]:
-        if not VLLM_AVAILABLE:
-            raise ImportError("vllm is not installed")
-
-        from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
-
-        # Delay import to avoid circular dependency
+    ) -> Optional[MarlinLinearMethod]:
+        # Delay the import to avoid circular dependency
         from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
 
         if isinstance(layer, LinearBase) or (
diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py
index 712da3961..174f2e533 100644
--- 
a/python/sglang/srt/managers/tp_worker.py
+++ b/python/sglang/srt/managers/tp_worker.py
@@ -53,8 +53,6 @@ class TpModelWorker:
         req_to_token_pool: Optional[ReqToTokenPool] = None,
         token_to_kv_pool_allocator: Optional[TokenToKVPoolAllocator] = None,
     ):
-        self.worker = self
-
         # Parse args
         self.tp_rank = tp_rank
 
@@ -134,6 +132,9 @@ class TpModelWorker:
         )[0]
         set_random_seed(self.random_seed)
 
+        # A reference so that this class has the same members as TpModelWorkerClient
+        self.worker = self
+
     def get_worker_info(self):
         return (
             self.max_total_num_tokens,
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index 4b733a67c..2133f5320 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -73,7 +73,7 @@ from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.utils import add_prefix, is_cuda, is_cuda_available, is_hip
+from sglang.srt.utils import add_prefix, is_cuda, is_hip
 
 _is_hip = is_hip()
 _is_cuda = is_cuda()
diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh
index 20f559076..38197b274 100755
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -1,10 +1,8 @@
 #!/bin/bash
+# Install the dependency in CI.
 set -euxo pipefail
 
-# Install the dependency in CI.
-
-
-# Use repo from environment variable, passed from GitHub Actions
+# Use repo from environment variables, passed from GitHub Actions
 FLASHINFER_REPO="${FLASHINFER_REPO:-https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python}"
 
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
@@ -17,17 +15,12 @@ pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2
 rm -rf /root/.cache/flashinfer
 # Force reinstall flashinfer and torch_memory_saver
 pip install flashinfer_python==0.2.3 --find-links ${FLASHINFER_REPO} --force-reinstall --no-deps
+pip install sgl-kernel==0.0.5.post3 --force-reinstall
 
-pip install torch_memory_saver --force-reinstall
-
-pip install transformers==4.50.0 sentence_transformers accelerate==1.4.0 peft pandas datasets
+pip install torch_memory_saver
+pip install transformers==4.50.0 sentence_transformers accelerate==1.4.0 peft pandas datasets timm
 
 # For compiling xgrammar kernels
 pip install cuda-python nvidia-cuda-nvrtc-cu12
 
-# For DeepSeek-VL2
-pip install timm
-
-pip install sgl-kernel==0.0.5.post3 --force-reinstall
-
 pip uninstall vllm -y || true
diff --git a/test/srt/test_eagle_infer.py b/test/srt/test_eagle_infer.py
index a3ea40e9e..e29b097c8 100644
--- a/test/srt/test_eagle_infer.py
+++ b/test/srt/test_eagle_infer.py
@@ -45,7 +45,7 @@ class TestEAGLEEngine(CustomTestCase):
         "mem_fraction_static": 0.7,
         "cuda_graph_max_bs": 4,
     }
-    NUM_CONFIGS = 3
+    NUM_CONFIGS = 2
 
     def setUp(self):
         self.prompt = "Today is a sunny day and I like"
@@ -61,8 +61,6 @@ class TestEAGLEEngine(CustomTestCase):
         configs = [
             # Basic config
             self.BASE_CONFIG,
-            # Disable cuda graph
-            {**self.BASE_CONFIG, "disable_cuda_graph": True},
             # Chunked prefill
             {**self.BASE_CONFIG, "chunked_prefill_size": 4},
         ]
diff --git a/test/srt/test_triton_attention_backend.py b/test/srt/test_triton_attention_backend.py
index c4cdf5e5b..829901dab 100644
--- a/test/srt/test_triton_attention_backend.py
+++ 
b/test/srt/test_triton_attention_backend.py @@ -28,7 +28,7 @@ class TestTritonAttnBackend(CustomTestCase): "triton", "--enable-torch-compile", "--cuda-graph-max-bs", - 16, + 4, ], )
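
Reviewer sketch (not part of the patch): the guard added to get_quantization_config keeps every method name registered while failing fast when a vllm-backed method is requested and vllm is missing. Below is a minimal, runnable illustration of that registry-plus-guard pattern; the config classes and registry entries are stand-ins, not the real sglang classes.

    # Sketch: always register all methods, defer the availability check.
    from typing import Dict, Type

    class QuantizationConfig:  # stand-in for sglang's base config class
        pass

    class W8A8Fp8Config(QuantizationConfig):  # vllm-independent method
        pass

    class AWQMarlinConfig(QuantizationConfig):  # vllm-backed method
        pass

    VLLM_AVAILABLE = False  # pretend the vllm import failed

    BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
        "w8a8_fp8": W8A8Fp8Config,
    }
    VLLM_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
        "awq_marlin": AWQMarlinConfig,
    }
    # Merge up front so error messages can list every known method name.
    QUANTIZATION_METHODS = {**BASE_QUANTIZATION_METHODS, **VLLM_QUANTIZATION_METHODS}

    def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
        if quantization not in QUANTIZATION_METHODS:
            raise ValueError(f"Invalid quantization method: {quantization}")
        # Fail fast, with an actionable message, for vllm-backed methods.
        if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
            raise ValueError(
                f"{quantization} quantization requires some operators from vllm. "
                "Please install vllm with `pip install vllm==0.7.2`"
            )
        return QUANTIZATION_METHODS[quantization]

    print(get_quantization_config("w8a8_fp8"))  # fine without vllm
    # get_quantization_config("awq_marlin")     # would raise ValueError here

The design choice is that availability is checked only when a specific method is selected, so merely importing the module (or listing methods) never requires vllm.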
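Similarly, monkey_patch_moe_apply probes the wrapped vllm apply() signature once, at patch time, to decide whether it may forward e_score_correction_bias. A toy version of that probe follows; apply_old and apply_new are hypothetical stand-ins for older and newer vllm method signatures.

    # Sketch: adapt a call to a wrapped function based on its signature.
    import inspect

    def apply_old(x, top_k):
        # Models an older vllm apply() without e_score_correction_bias.
        return x, top_k

    def apply_new(x, top_k, e_score_correction_bias=None):
        # Models a newer vllm apply() that accepts the bias.
        return x, top_k, e_score_correction_bias

    def wrap_apply(original_apply):
        # Probe the wrapped signature once, when the patch is installed.
        params = inspect.signature(original_apply).parameters
        has_correction_bias = "e_score_correction_bias" in params

        def new_apply(x, top_k, correction_bias=None):
            kwargs = {"x": x, "top_k": top_k}
            if correction_bias is not None:
                if not has_correction_bias:
                    raise ValueError(
                        "Please upgrade vllm. Try `pip install vllm==0.7.2`"
                    )
                kwargs["e_score_correction_bias"] = correction_bias
            return original_apply(**kwargs)

        return new_apply

    print(wrap_apply(apply_new)(1, 2, correction_bias=3))  # forwards the bias
    print(wrap_apply(apply_old)(1, 2))                     # old path still works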