diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 767d2af47..efcc3a3a4 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -47,7 +47,7 @@ jobs:
           bash scripts/ci_install_dependency.sh

       - name: Run test
-        timeout-minutes: 30
+        timeout-minutes: 25
         run: |
           cd test/srt
           python3 run_suite.py --suite minimal --range-begin 0 --range-end 5
diff --git a/python/pyproject.toml b/python/pyproject.toml
index b5fa4ceea..5e144f809 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -20,7 +20,7 @@ runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hu
     "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic",
     "python-multipart", "torchao", "uvicorn", "uvloop", "pyzmq>=25.1.2",
     "outlines>=0.0.44,<0.1.0", "modelscope"]
-srt = ["sglang[runtime_common]", "torch", "vllm==0.6.4.post1"]
+srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py
index d31dc81ed..94d48e82b 100644
--- a/python/sglang/srt/layers/activation.py
+++ b/python/sglang/srt/layers/activation.py
@@ -38,7 +38,6 @@ from sglang.srt.utils import set_weight_attrs
 logger = logging.getLogger(__name__)


-@CustomOp.register("silu_and_mul")
 class SiluAndMul(CustomOp):
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
@@ -52,7 +51,6 @@ class SiluAndMul(CustomOp):
         return out


-@CustomOp.register("gelu_and_mul")
 class GeluAndMul(CustomOp):
     def __init__(self, approximate="tanh"):
         super().__init__()
diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py
index 3ffa91575..3ae392eb9 100644
--- a/python/sglang/srt/layers/layernorm.py
+++ b/python/sglang/srt/layers/layernorm.py
@@ -36,7 +36,6 @@ from vllm.model_executor.custom_op import CustomOp
 logger = logging.getLogger(__name__)


-@CustomOp.register("rmsnorm")
 class RMSNorm(CustomOp):
     def __init__(
         self,
@@ -79,7 +78,6 @@ class RMSNorm(CustomOp):
         return x, residual


-@CustomOp.register("gemma_rmsnorm")
 class GemmaRMSNorm(CustomOp):
     def __init__(
         self,
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index e36c6028f..5cde1e942 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -28,7 +28,6 @@ import torch
 import torch.nn as nn
 from vllm.config import DeviceConfig, LoadConfig
 from vllm.config import ModelConfig as VllmModelConfig
-from vllm.config import VllmConfig
 from vllm.distributed import (
     get_tp_group,
     init_distributed_environment,
@@ -60,7 +59,6 @@ from sglang.srt.utils import (
     enable_show_time_cost,
     get_available_gpu_memory,
     monkey_patch_vllm_dummy_weight_loader,
-    monkey_patch_vllm_model_config,
     monkey_patch_vllm_p2p_access_check,
 )

@@ -245,14 +243,12 @@ class ModelRunner:
         # Prepare the vllm model config
         monkey_patch_vllm_dummy_weight_loader()
-        monkey_patch_vllm_model_config()
         self.load_config = LoadConfig(
             load_format=self.server_args.load_format,
             download_dir=self.server_args.download_dir,
         )

         self.vllm_model_config = VllmModelConfig(
             model=self.server_args.model_path,
-            task="generate" if self.model_config.is_generation else "embedding",
             quantization=self.server_args.quantization,
             tokenizer=None,
             tokenizer_mode=None,
@@ -267,17 +263,15 @@ class ModelRunner:
         )
         self.dtype = self.vllm_model_config.dtype

-        self.vllm_config = VllmConfig()
-        self.vllm_config.model_config = self.vllm_model_config
-        self.vllm_config.load_config = self.load_config
-        self.vllm_config.device_config = DeviceConfig(self.device)
-        self.vllm_config.quant_config = VllmConfig._get_quantization_config(
-            self.vllm_config.model_config, self.vllm_config.load_config
-        )
-
         # Load the model
         self.model = get_model(
-            vllm_config=self.vllm_config,
+            model_config=self.vllm_model_config,
+            load_config=self.load_config,
+            device_config=DeviceConfig(self.device),
+            parallel_config=None,
+            scheduler_config=None,
+            lora_config=None,
+            cache_config=None,
         )
         self.sliding_window_size = (
             self.model.get_attention_sliding_window_size()
@@ -312,7 +306,6 @@ class ModelRunner:
         # TODO: Use a better method to check this
         vllm_model_config = VllmModelConfig(
             model=model_path,
-            task="generate" if self.model_config.is_generation else "embedding",
             quantization=self.server_args.quantization,
             tokenizer=None,
             tokenizer_mode=None,
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index 994da0458..32317ec2e 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -410,23 +410,37 @@ def monkey_patch_vllm_dummy_weight_loader():
     Monkey patch the dummy weight loader in vllm to
     call process_weights_after_loading.
     """
-    from vllm.config import VllmConfig
     from vllm.model_executor.model_loader.loader import (
+        CacheConfig,
+        DeviceConfig,
         DummyModelLoader,
+        LoRAConfig,
+        ModelConfig,
+        ParallelConfig,
+        SchedulerConfig,
         _initialize_model,
         initialize_dummy_weights,
         nn,
         set_default_torch_dtype,
     )

-    def load_model(self, *, vllm_config: VllmConfig) -> nn.Module:
-        with set_default_torch_dtype(vllm_config.model_config.dtype):
-            with torch.device(vllm_config.device_config.device):
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+        lora_config: Optional[LoRAConfig],
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        cache_config: CacheConfig,
+    ) -> nn.Module:
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
                 model = _initialize_model(
-                    vllm_config.model_config,
+                    model_config,
                     self.load_config,
-                    vllm_config.lora_config,
-                    vllm_config.cache_config,
+                    lora_config,
+                    cache_config,
                 )

                 for _, module in model.named_modules():
@@ -498,60 +512,6 @@ def maybe_set_triton_cache_manager() -> None:
         os.environ["TRITON_CACHE_MANAGER"] = manager


-def monkey_patch_vllm_model_config():
-    from typing import Dict, Set, Tuple, Union
-
-    from transformers import PretrainedConfig
-    from vllm.config import ModelConfig, TaskOption, _Task
-
-    def _resolve_task(
-        self,
-        task_option: Union[TaskOption, _Task],
-        hf_config: PretrainedConfig,
-    ) -> Tuple[Set[_Task], _Task]:
-
-        architectures = getattr(hf_config, "architectures", [])
-        if isinstance(architectures, str):
-            architectures = [architectures]
-
-        non_generation_models = {
-            "LlamaEmbeddingModel",
-            "MistralModel",
-            "LlamaForSequenceClassificationWithNormalWeights",
-            "LlamaForSequenceClassification",
-            "InternLM2ForRewardModel",
-        }
-
-        is_generation = not any(arch in non_generation_models for arch in architectures)
-
-        auto_map = getattr(hf_config, "auto_map", {})
-        has_sequence_classification = any(
-            "ForSequenceClassification" in v for v in auto_map.values()
-        )
-
-        task_support: Dict[_Task, bool] = {
-            "generate": is_generation,
-            "embedding": (not is_generation) or has_sequence_classification,
-        }
-
-        supported_tasks_lst = [
-            task for task, is_supported in task_support.items() if is_supported
-        ]
-        supported_tasks = set(supported_tasks_lst)
-
-        if task_option not in supported_tasks:
-            msg = (
-                f"This model does not support the '{task_option}' task. "
-                f"Supported tasks: {supported_tasks}"
-            )
-            raise ValueError(msg)
-        selected_task = task_option
-
-        return supported_tasks, selected_task
-
-    setattr(ModelConfig, "_resolve_task", _resolve_task)
-
-
 class CustomCacheManager(FileCacheManager):
     # Adapted from: https://github.com/tdoublep/vllm/blob/3307522289fdfefe323b6c00d0db696651989a2f/vllm/triton_utils/custom_cache_manager.py
     def __init__(self, key, override=False, dump=False):
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index 96b6c0380..6955d4917 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -1,4 +1,3 @@
-import sys
 import unittest

 from sglang.test.test_utils import (
@@ -36,12 +35,7 @@ class TestBenchServing(unittest.TestCase):
         )

         if is_in_ci():
-            print(
-                f"Output throughput: {res['output_throughput']}, Is greater than 1000: {res['output_throughput'] > 1000}",
-                file=sys.stderr,
-            )
-            # TODO(zhyncs) fix this
-            # assert res["output_throughput"] > 1000
+            assert res["output_throughput"] > 1000

     def test_offline_throughput_without_radix_cache(self):
         res = run_bench_serving(
diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py
index ede25b1d4..49ef46169 100644
--- a/test/srt/test_nightly_gsm8k_eval.py
+++ b/test/srt/test_nightly_gsm8k_eval.py
@@ -1,7 +1,4 @@
-import json
-import os
 import unittest
-from datetime import datetime
 from types import SimpleNamespace

 from sglang.srt.utils import kill_child_process
@@ -17,26 +14,6 @@ from sglang.test.test_utils import (
     popen_launch_server,
 )

-MODEL_SCORE_THRESHOLDS = {
-    "meta-llama/Llama-3.1-8B-Instruct": 0.8316,
-    "mistralai/Mistral-7B-Instruct-v0.3": 0.5861,
-    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.8672,
-    "google/gemma-2-27b-it": 0.9227,
-    "meta-llama/Llama-3.1-70B-Instruct": 0.9623,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.6415,
-    "Qwen/Qwen2-57B-A14B-Instruct": 0.8791,
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.8672,
-    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.5544,
-    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.8356,
-    "neuralmagic/gemma-2-2b-it-FP8": 0.6059,
-    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.9504,
-    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.6138,
-    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.9504,
-    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.8197,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.8395,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.8435,
-}
-

 def parse_models(model_string):
     return [model.strip() for model in model_string.split(",") if model.strip()]
@@ -46,8 +23,10 @@ def launch_server(base_url, model, is_fp8, is_tp2):
     other_args = ["--log-level-http", "warning", "--trust-remote-code"]
     if is_fp8:
         if "Llama-3" in model or "gemma-2" in model:
+            # compressed-tensors checkpoint; quantization is auto-detected
             other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
         elif "Qwen2-72B-Instruct-FP8" in model:
+            # fp8 kv-cache skipped for this model due to a bug
             other_args.extend(["--quantization", "fp8"])
         else:
             other_args.extend(["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"])
@@ -69,49 +48,6 @@ def launch_server(base_url, model, is_fp8, is_tp2):
     return process


-def write_results_to_json(model, metrics, mode="a"):
-    result = {
-        "timestamp": datetime.now().isoformat(),
-        "model": model,
-        "metrics": metrics,
-        "score": metrics["score"],
-    }
-
-    existing_results = []
-    if mode == "a" and os.path.exists("results.json"):
mode == "a" and os.path.exists("results.json"): - try: - with open("results.json", "r") as f: - existing_results = json.load(f) - except json.JSONDecodeError: - existing_results = [] - - if isinstance(existing_results, list): - existing_results.append(result) - else: - existing_results = [result] - - with open("results.json", "w") as f: - json.dump(existing_results, f, indent=2) - - -def check_model_scores(results): - failed_models = [] - for model, score in results: - threshold = MODEL_SCORE_THRESHOLDS.get(model) - if threshold is None: - print(f"Warning: No threshold defined for model {model}") - continue - - if score < threshold: - failed_models.append( - f"\nScore Check Failed: {model}\n" - f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})" - ) - - if failed_models: - raise AssertionError("\n".join(failed_models)) - - class TestEvalAccuracyLarge(unittest.TestCase): @classmethod def setUpClass(cls): @@ -132,9 +68,6 @@ class TestEvalAccuracyLarge(unittest.TestCase): kill_child_process(self.process.pid, include_self=True) def test_mgsm_en_all_models(self): - is_first = True - all_results = [] - for model_group, is_fp8, is_tp2 in self.model_groups: for model in model_group: with self.subTest(model=model): @@ -152,24 +85,11 @@ class TestEvalAccuracyLarge(unittest.TestCase): print( f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n" ) - - write_results_to_json(model, metrics, "w" if is_first else "a") - is_first = False - - all_results.append((model, metrics["score"])) + # loosely threshold + assert metrics["score"] > 0.5, f"score={metrics['score']} <= 0.5" self.tearDown() - try: - with open("results.json", "r") as f: - print("\nFinal Results from results.json:") - print(json.dumps(json.load(f), indent=2)) - except Exception as e: - print(f"Error reading results.json: {e}") - - # Check all scores after collecting all results - check_model_scores(all_results) - if __name__ == "__main__": unittest.main() diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index bd1741b16..ddb92a57f 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -66,7 +66,7 @@ class TestTorchCompile(unittest.TestCase): print(res["text"]) throughput = max_tokens / (tok - tic) print(f"Throughput: {throughput} tokens/s") - self.assertGreaterEqual(throughput, 151) + self.assertGreaterEqual(throughput, 152) if __name__ == "__main__": diff --git a/test/srt/test_torch_compile_moe.py b/test/srt/test_torch_compile_moe.py index a82b61e41..934ef3499 100644 --- a/test/srt/test_torch_compile_moe.py +++ b/test/srt/test_torch_compile_moe.py @@ -66,7 +66,7 @@ class TestTorchCompile(unittest.TestCase): print(f"{res=}") throughput = max_tokens / (tok - tic) print(f"Throughput: {throughput} tokens/s") - self.assertGreaterEqual(throughput, 289) + self.assertGreaterEqual(throughput, 290) if __name__ == "__main__":