diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 368c205a2..b4eb6106d 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -54,7 +54,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        part: [0, 1, 2, 3, 4, 5, 6]
+        part: [0, 1, 2, 3, 4, 5, 6, 7]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -64,10 +64,10 @@ jobs:
           bash scripts/ci_install_dependency.sh
 
       - name: Run test
-        timeout-minutes: 40
+        timeout-minutes: 30
         run: |
           cd test/srt
-          python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
+          python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 8
 
   unit-test-backend-2-gpu:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index 5290d7a2a..9b2722126 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -977,6 +977,7 @@ async def benchmark(
     profile: bool,
     pd_seperated: bool = False,
     flush_cache: bool = False,
+    warmup_requests: int = 1,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -993,10 +994,8 @@ async def benchmark(
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
-    if not hasattr(args, "warmup_requests"):
-        args.warmup_requests = 1
     # Warmup
-    print(f"Starting warmup with {args.warmup_requests} sequences...")
+    print(f"Starting warmup with {warmup_requests} sequences...")
 
     # Use the first request for all warmup iterations
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
@@ -1018,7 +1017,7 @@ async def benchmark(
 
     # Run warmup requests
     warmup_tasks = []
-    for _ in range(args.warmup_requests):
+    for _ in range(warmup_requests):
         warmup_tasks.append(
             asyncio.create_task(request_func(request_func_input=test_input))
         )
@@ -1026,9 +1025,7 @@ async def benchmark(
     warmup_outputs = await asyncio.gather(*warmup_tasks)
 
     # Check if at least one warmup request succeeded
-    if args.warmup_requests > 0 and not any(
-        output.success for output in warmup_outputs
-    ):
+    if warmup_requests > 0 and not any(output.success for output in warmup_outputs):
         raise ValueError(
             "Warmup failed - Please make sure benchmark arguments "
             f"are correctly specified. Error: {warmup_outputs[0].error}"
diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py
index 4754c84d0..7f1cc01fd 100644
--- a/python/sglang/srt/entrypoints/http_server.py
+++ b/python/sglang/srt/entrypoints/http_server.py
@@ -281,7 +281,9 @@ async def generate_from_file_request(file: UploadFile, request: Request):
     )
 
     try:
-        ret = await _global_state.generate_request(obj, request).__anext__()
+        ret = await _global_state.tokenizer_manager.generate_request(
+            obj, request
+        ).__anext__()
         return ret
     except ValueError as e:
         logger.error(f"Error: {e}")
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index e57f9ce6b..7e1889f39 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -8,7 +8,6 @@ import random
 import subprocess
 import threading
 import time
-import traceback
 import unittest
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 5c9f3b89a..8da966d84 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -14,7 +14,7 @@ class TestFile:
 suites = {
     "per-commit": [
         TestFile("models/lora/test_lora.py", 76),
-        TestFile("models/lora/test_lora_backend.py", 420),
+        TestFile("models/lora/test_lora_backend.py", 99),
         TestFile("models/lora/test_multi_lora_backend.py", 60),
         TestFile("models/test_embedding_models.py", 35),
         TestFile("models/test_generation_models.py", 103),
@@ -23,30 +23,30 @@ suites = {
         TestFile("models/test_compressed_tensors_models.py", 100),
         TestFile("models/test_reward_models.py", 83),
         TestFile("models/test_gme_qwen_models.py", 45),
-        TestFile("models/test_clip_models.py", 100),
-        TestFile("models/test_vlm_models.py", 100),
+        TestFile("models/test_clip_models.py", 52),
+        TestFile("models/test_vlm_models.py", 581),
         TestFile("test_abort.py", 51),
         TestFile("test_block_int8.py", 22),
-        TestFile("test_chunked_prefill.py", 336),
-        TestFile("test_eagle_infer.py", 500),
+        TestFile("test_chunked_prefill.py", 285),
+        TestFile("test_eagle_infer.py", 584),
         TestFile("test_ebnf_constrained.py"),
-        TestFile("test_fa3.py", 400),
+        TestFile("test_fa3.py", 376),
         TestFile("test_fp8_kernel.py", 8),
-        TestFile("test_embedding_openai_server.py", 36),
+        TestFile("test_embedding_openai_server.py", 141),
         TestFile("test_hidden_states.py", 55),
         TestFile("test_int8_kernel.py", 8),
         TestFile("test_input_embeddings.py", 38),
         TestFile("test_json_constrained.py", 98),
         TestFile("test_large_max_new_tokens.py", 41),
         TestFile("test_metrics.py", 32),
-        TestFile("test_mla.py", 162),
+        TestFile("test_mla.py", 242),
         TestFile("test_mla_deepseek_v3.py", 221),
-        TestFile("test_mla_int8_deepseek_v3.py", 522),
+        TestFile("test_mla_int8_deepseek_v3.py", 674),
         TestFile("test_mla_flashinfer.py", 395),
-        TestFile("test_mla_fp8.py", 93),
+        TestFile("test_mla_fp8.py", 153),
         TestFile("test_no_chunked_prefill.py", 126),
         TestFile("test_no_overlap_scheduler.py", 262),
-        TestFile("test_openai_server.py", 186),
+        TestFile("test_openai_server.py", 149),
         TestFile("test_penalty.py", 41),
         TestFile("test_page_size.py", 60),
         TestFile("test_pytorch_sampling_backend.py", 66),
@@ -57,11 +57,11 @@ suites = {
         TestFile("test_request_length_validation.py", 31),
         TestFile("test_retract_decode.py", 54),
         TestFile("test_server_args.py", 1),
-        TestFile("test_skip_tokenizer_init.py", 72),
+        TestFile("test_skip_tokenizer_init.py", 117),
         TestFile("test_srt_engine.py", 237),
         TestFile("test_srt_endpoint.py", 94),
TestFile("test_torch_compile.py", 76), - TestFile("test_torch_compile_moe.py", 85), + TestFile("test_torch_compile_moe.py", 235), TestFile("test_torch_native_attention_backend.py", 123), TestFile("test_torchao.py", 70), TestFile("test_triton_attention_kernels.py", 4), @@ -69,27 +69,27 @@ suites = { TestFile("test_update_weights_from_disk.py", 114), TestFile("test_update_weights_from_tensor.py", 48), TestFile("test_vertex_endpoint.py", 31), - TestFile("test_vision_chunked_prefill.py", 99), + TestFile("test_vision_chunked_prefill.py", 119), TestFile("test_vlm_accuracy.py", 60), - TestFile("test_vision_openai_server.py", 537), + TestFile("test_vision_openai_server.py", 637), TestFile("test_fim_completion.py", 40), TestFile("test_w8a8_quantization.py", 46), TestFile("test_eval_fp8_accuracy.py", 303), TestFile("test_create_kvindices.py", 2), - TestFile("test_hicache.py", 60), - TestFile("test_hicache_mla.py", 90), + TestFile("test_hicache.py", 116), + TestFile("test_hicache_mla.py", 254), TestFile("test_fused_moe.py", 30), TestFile("test_triton_moe_channel_fp8_kernel.py", 25), ], "per-commit-2-gpu": [ - TestFile("models/lora/test_lora_tp.py", 150), - TestFile("test_data_parallelism.py", 90), - TestFile("test_dp_attention.py", 150), - TestFile("test_mla_tp.py", 174), - TestFile("test_moe_ep.py", 220), - TestFile("test_patch_torch.py", 30), - TestFile("test_update_weights_from_distributed.py", 100), - TestFile("test_verl_engine.py", 100), + TestFile("models/lora/test_lora_tp.py", 116), + TestFile("test_data_parallelism.py", 73), + TestFile("test_dp_attention.py", 137), + TestFile("test_mla_tp.py", 170), + TestFile("test_moe_ep.py", 181), + TestFile("test_patch_torch.py", 19), + TestFile("test_update_weights_from_distributed.py", 103), + TestFile("test_verl_engine.py", 64), ], "per-commit-8-gpu": [ TestFile("test_local_attn.py", 250), diff --git a/test/srt/test_torch_compile_moe.py b/test/srt/test_torch_compile_moe.py index 34c80d450..42415b155 100644 --- a/test/srt/test_torch_compile_moe.py +++ b/test/srt/test_torch_compile_moe.py @@ -24,7 +24,7 @@ class TestTorchCompileMoe(CustomTestCase): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--enable-torch-compile", "--torch-compile-max-bs", "8"], + other_args=["--enable-torch-compile", "--torch-compile-max-bs", "4"], ) @classmethod diff --git a/test/srt/test_update_weights_from_distributed.py b/test/srt/test_update_weights_from_distributed.py index 7352e757a..e558a56e3 100644 --- a/test/srt/test_update_weights_from_distributed.py +++ b/test/srt/test_update_weights_from_distributed.py @@ -129,7 +129,7 @@ def init_process_hf( hf_instruct_params = [] hf_base_params = [] - print("get parameter in hf instruct model and base model") + print("[hf] get parameter in hf instruct model and base model") for parameter_name in checking_parameters: hf_instruct_params.append( hf_instruct_model.get_parameter(parameter_name)[:truncate_size] @@ -152,10 +152,12 @@ def init_process_hf( param_queue.put(("hf_base_params", hf_base_params)) # Init weight update group for rank 0 (the training engine in RLHF). - print(f"rank {rank} world_size: {world_size} init custom process group") + port = 60000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100 + init_method = f"tcp://localhost:{port}" + print(f"[hf] {rank=} {world_size=} init custom process group. 
{init_method=}") group = init_custom_process_group( backend="nccl", - init_method="tcp://localhost:65500", + init_method=init_method, world_size=world_size, rank=rank, group_name="test_parameter_update_group", @@ -184,7 +186,7 @@ def init_process_hf( # Measure the latency of broadcasting/weights update. broadcast_time = time_end_broadcast - time_begin_broadcast - print(f"rank {rank} broadcast parameter time: {broadcast_time:.3f}s") + print(f"[hf] {rank=} {broadcast_time=:.3f}s") param_queue.put(("broadcast_time", broadcast_time)) # Delete the huggingface models to free up memory. @@ -210,17 +212,21 @@ def init_process_sgl( torch.cuda.synchronize() base_gpu_id = 1 if rank == 1 else 1 + tp_size if backend == "Engine": + print(f"[sgl] rank {rank} init engine") engine = sgl.Engine( model_path=model_name, - random_seed=42, base_gpu_id=base_gpu_id, tp_size=tp_size, + cuda_graph_max_bs=2, ) else: if rank == 1: url = DEFAULT_URL_FOR_TEST else: - url = DEFAULT_URL_FOR_TEST.replace("2157", "2159") + host, port = DEFAULT_URL_FOR_TEST.split(":") + url = ":".join(host, str(int(port) + 10000)) + + print(f"[sgl] rank {rank} init server on url: {url}") process = popen_launch_server( model_name, url, @@ -230,13 +236,11 @@ def init_process_sgl( str(base_gpu_id), "--tp-size", str(tp_size), + "--cuda-graph-max-bs", + 2, ), ) torch.cuda.synchronize() - if backend == "Engine": - print(f"rank {rank} init engine") - else: - print(f"rank {rank} init server on url: {url}") # Get weights of instruct model, i.e. pre-training weights. instruct_params = [] @@ -252,11 +256,13 @@ def init_process_sgl( param_queue.put((f"sgl_dp_{rank}_instruct_params", instruct_params)) + port = 60000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100 + # Init weight update group with the training engine. if backend == "Engine": engine.init_weights_update_group( master_address="localhost", - master_port="65500", + master_port=str(port), rank_offset=base_gpu_id, world_size=world_size, group_name="test_parameter_update_group", @@ -267,7 +273,7 @@ def init_process_sgl( f"{url}/init_weights_update_group", json={ "master_address": "localhost", - "master_port": "65500", + "master_port": str(port), "rank_offset": base_gpu_id, "world_size": world_size, "group_name": "test_parameter_update_group", @@ -311,7 +317,7 @@ def init_process_sgl( # Measure the latency of broadcast/weights update. update_time = time_end_update - time_begin_update print( - f"fully update model_name {model_name} rank {rank} parameter from distributed time: {update_time:.3f}s" + f"[sgl] fully update model_name {model_name} rank {rank} parameter from distributed time: {update_time:.3f}s" ) param_queue.put((f"update_sgl_dp_{rank}_time", update_time))