diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 368c205a2..b4eb6106d 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -54,7 +54,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        part: [0, 1, 2, 3, 4, 5, 6]
+        part: [0, 1, 2, 3, 4, 5, 6, 7]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -64,10 +64,10 @@ jobs:
           bash scripts/ci_install_dependency.sh
 
       - name: Run test
-        timeout-minutes: 40
+        timeout-minutes: 30
         run: |
           cd test/srt
-          python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
+          python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 8
 
   unit-test-backend-2-gpu:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index 5290d7a2a..9b2722126 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -977,6 +977,7 @@ async def benchmark(
     profile: bool,
     pd_seperated: bool = False,
     flush_cache: bool = False,
+    warmup_requests: int = 1,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -993,10 +994,8 @@ async def benchmark(
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
-    if not hasattr(args, "warmup_requests"):
-        args.warmup_requests = 1
     # Warmup
-    print(f"Starting warmup with {args.warmup_requests} sequences...")
+    print(f"Starting warmup with {warmup_requests} sequences...")
 
     # Use the first request for all warmup iterations
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
@@ -1018,7 +1017,7 @@ async def benchmark(
 
     # Run warmup requests
     warmup_tasks = []
-    for _ in range(args.warmup_requests):
+    for _ in range(warmup_requests):
         warmup_tasks.append(
             asyncio.create_task(request_func(request_func_input=test_input))
         )
@@ -1026,9 +1025,7 @@ async def benchmark(
     warmup_outputs = await asyncio.gather(*warmup_tasks)
 
     # Check if at least one warmup request succeeded
-    if args.warmup_requests > 0 and not any(
-        output.success for output in warmup_outputs
-    ):
+    if warmup_requests > 0 and not any(output.success for output in warmup_outputs):
         raise ValueError(
             "Warmup failed - Please make sure benchmark arguments "
             f"are correctly specified. Error: {warmup_outputs[0].error}"
diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py
index 4754c84d0..7f1cc01fd 100644
--- a/python/sglang/srt/entrypoints/http_server.py
+++ b/python/sglang/srt/entrypoints/http_server.py
@@ -281,7 +281,9 @@ async def generate_from_file_request(file: UploadFile, request: Request):
     )
 
     try:
-        ret = await _global_state.generate_request(obj, request).__anext__()
+        ret = await _global_state.tokenizer_manager.generate_request(
+            obj, request
+        ).__anext__()
         return ret
     except ValueError as e:
         logger.error(f"Error: {e}")
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index e57f9ce6b..7e1889f39 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -8,7 +8,6 @@ import random
 import subprocess
 import threading
 import time
-import traceback
 import unittest
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 5c9f3b89a..8da966d84 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -14,7 +14,7 @@ class TestFile:
 suites = {
     "per-commit": [
         TestFile("models/lora/test_lora.py", 76),
-        TestFile("models/lora/test_lora_backend.py", 420),
+        TestFile("models/lora/test_lora_backend.py", 99),
         TestFile("models/lora/test_multi_lora_backend.py", 60),
         TestFile("models/test_embedding_models.py", 35),
         TestFile("models/test_generation_models.py", 103),
@@ -23,30 +23,30 @@ suites = {
         TestFile("models/test_compressed_tensors_models.py", 100),
         TestFile("models/test_reward_models.py", 83),
         TestFile("models/test_gme_qwen_models.py", 45),
-        TestFile("models/test_clip_models.py", 100),
-        TestFile("models/test_vlm_models.py", 100),
+        TestFile("models/test_clip_models.py", 52),
+        TestFile("models/test_vlm_models.py", 581),
         TestFile("test_abort.py", 51),
         TestFile("test_block_int8.py", 22),
-        TestFile("test_chunked_prefill.py", 336),
-        TestFile("test_eagle_infer.py", 500),
+        TestFile("test_chunked_prefill.py", 285),
+        TestFile("test_eagle_infer.py", 584),
         TestFile("test_ebnf_constrained.py"),
-        TestFile("test_fa3.py", 400),
+        TestFile("test_fa3.py", 376),
         TestFile("test_fp8_kernel.py", 8),
-        TestFile("test_embedding_openai_server.py", 36),
+        TestFile("test_embedding_openai_server.py", 141),
         TestFile("test_hidden_states.py", 55),
         TestFile("test_int8_kernel.py", 8),
         TestFile("test_input_embeddings.py", 38),
         TestFile("test_json_constrained.py", 98),
         TestFile("test_large_max_new_tokens.py", 41),
         TestFile("test_metrics.py", 32),
-        TestFile("test_mla.py", 162),
+        TestFile("test_mla.py", 242),
         TestFile("test_mla_deepseek_v3.py", 221),
-        TestFile("test_mla_int8_deepseek_v3.py", 522),
+        TestFile("test_mla_int8_deepseek_v3.py", 674),
         TestFile("test_mla_flashinfer.py", 395),
-        TestFile("test_mla_fp8.py", 93),
+        TestFile("test_mla_fp8.py", 153),
         TestFile("test_no_chunked_prefill.py", 126),
         TestFile("test_no_overlap_scheduler.py", 262),
-        TestFile("test_openai_server.py", 186),
+        TestFile("test_openai_server.py", 149),
         TestFile("test_penalty.py", 41),
         TestFile("test_page_size.py", 60),
         TestFile("test_pytorch_sampling_backend.py", 66),
@@ -57,11 +57,11 @@ suites = {
         TestFile("test_request_length_validation.py", 31),
         TestFile("test_retract_decode.py", 54),
         TestFile("test_server_args.py", 1),
-        TestFile("test_skip_tokenizer_init.py", 72),
+        TestFile("test_skip_tokenizer_init.py", 117),
         TestFile("test_srt_engine.py", 237),
         TestFile("test_srt_endpoint.py", 94),
TestFile("test_torch_compile.py", 76), - TestFile("test_torch_compile_moe.py", 85), + TestFile("test_torch_compile_moe.py", 235), TestFile("test_torch_native_attention_backend.py", 123), TestFile("test_torchao.py", 70), TestFile("test_triton_attention_kernels.py", 4), @@ -69,27 +69,27 @@ suites = { TestFile("test_update_weights_from_disk.py", 114), TestFile("test_update_weights_from_tensor.py", 48), TestFile("test_vertex_endpoint.py", 31), - TestFile("test_vision_chunked_prefill.py", 99), + TestFile("test_vision_chunked_prefill.py", 119), TestFile("test_vlm_accuracy.py", 60), - TestFile("test_vision_openai_server.py", 537), + TestFile("test_vision_openai_server.py", 637), TestFile("test_fim_completion.py", 40), TestFile("test_w8a8_quantization.py", 46), TestFile("test_eval_fp8_accuracy.py", 303), TestFile("test_create_kvindices.py", 2), - TestFile("test_hicache.py", 60), - TestFile("test_hicache_mla.py", 90), + TestFile("test_hicache.py", 116), + TestFile("test_hicache_mla.py", 254), TestFile("test_fused_moe.py", 30), TestFile("test_triton_moe_channel_fp8_kernel.py", 25), ], "per-commit-2-gpu": [ - TestFile("models/lora/test_lora_tp.py", 150), - TestFile("test_data_parallelism.py", 90), - TestFile("test_dp_attention.py", 150), - TestFile("test_mla_tp.py", 174), - TestFile("test_moe_ep.py", 220), - TestFile("test_patch_torch.py", 30), - TestFile("test_update_weights_from_distributed.py", 100), - TestFile("test_verl_engine.py", 100), + TestFile("models/lora/test_lora_tp.py", 116), + TestFile("test_data_parallelism.py", 73), + TestFile("test_dp_attention.py", 137), + TestFile("test_mla_tp.py", 170), + TestFile("test_moe_ep.py", 181), + TestFile("test_patch_torch.py", 19), + TestFile("test_update_weights_from_distributed.py", 103), + TestFile("test_verl_engine.py", 64), ], "per-commit-8-gpu": [ TestFile("test_local_attn.py", 250), diff --git a/test/srt/test_torch_compile_moe.py b/test/srt/test_torch_compile_moe.py index 34c80d450..42415b155 100644 --- a/test/srt/test_torch_compile_moe.py +++ b/test/srt/test_torch_compile_moe.py @@ -24,7 +24,7 @@ class TestTorchCompileMoe(CustomTestCase): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--enable-torch-compile", "--torch-compile-max-bs", "8"], + other_args=["--enable-torch-compile", "--torch-compile-max-bs", "4"], ) @classmethod diff --git a/test/srt/test_update_weights_from_distributed.py b/test/srt/test_update_weights_from_distributed.py index 7352e757a..e558a56e3 100644 --- a/test/srt/test_update_weights_from_distributed.py +++ b/test/srt/test_update_weights_from_distributed.py @@ -129,7 +129,7 @@ def init_process_hf( hf_instruct_params = [] hf_base_params = [] - print("get parameter in hf instruct model and base model") + print("[hf] get parameter in hf instruct model and base model") for parameter_name in checking_parameters: hf_instruct_params.append( hf_instruct_model.get_parameter(parameter_name)[:truncate_size] @@ -152,10 +152,12 @@ def init_process_hf( param_queue.put(("hf_base_params", hf_base_params)) # Init weight update group for rank 0 (the training engine in RLHF). - print(f"rank {rank} world_size: {world_size} init custom process group") + port = 60000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100 + init_method = f"tcp://localhost:{port}" + print(f"[hf] {rank=} {world_size=} init custom process group. 
{init_method=}") group = init_custom_process_group( backend="nccl", - init_method="tcp://localhost:65500", + init_method=init_method, world_size=world_size, rank=rank, group_name="test_parameter_update_group", @@ -184,7 +186,7 @@ def init_process_hf( # Measure the latency of broadcasting/weights update. broadcast_time = time_end_broadcast - time_begin_broadcast - print(f"rank {rank} broadcast parameter time: {broadcast_time:.3f}s") + print(f"[hf] {rank=} {broadcast_time=:.3f}s") param_queue.put(("broadcast_time", broadcast_time)) # Delete the huggingface models to free up memory. @@ -210,17 +212,21 @@ def init_process_sgl( torch.cuda.synchronize() base_gpu_id = 1 if rank == 1 else 1 + tp_size if backend == "Engine": + print(f"[sgl] rank {rank} init engine") engine = sgl.Engine( model_path=model_name, - random_seed=42, base_gpu_id=base_gpu_id, tp_size=tp_size, + cuda_graph_max_bs=2, ) else: if rank == 1: url = DEFAULT_URL_FOR_TEST else: - url = DEFAULT_URL_FOR_TEST.replace("2157", "2159") + host, port = DEFAULT_URL_FOR_TEST.split(":") + url = ":".join(host, str(int(port) + 10000)) + + print(f"[sgl] rank {rank} init server on url: {url}") process = popen_launch_server( model_name, url, @@ -230,13 +236,11 @@ def init_process_sgl( str(base_gpu_id), "--tp-size", str(tp_size), + "--cuda-graph-max-bs", + 2, ), ) torch.cuda.synchronize() - if backend == "Engine": - print(f"rank {rank} init engine") - else: - print(f"rank {rank} init server on url: {url}") # Get weights of instruct model, i.e. pre-training weights. instruct_params = [] @@ -252,11 +256,13 @@ def init_process_sgl( param_queue.put((f"sgl_dp_{rank}_instruct_params", instruct_params)) + port = 60000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100 + # Init weight update group with the training engine. if backend == "Engine": engine.init_weights_update_group( master_address="localhost", - master_port="65500", + master_port=str(port), rank_offset=base_gpu_id, world_size=world_size, group_name="test_parameter_update_group", @@ -267,7 +273,7 @@ def init_process_sgl( f"{url}/init_weights_update_group", json={ "master_address": "localhost", - "master_port": "65500", + "master_port": str(port), "rank_offset": base_gpu_id, "world_size": world_size, "group_name": "test_parameter_update_group", @@ -311,7 +317,7 @@ def init_process_sgl( # Measure the latency of broadcast/weights update. update_time = time_end_update - time_begin_update print( - f"fully update model_name {model_name} rank {rank} parameter from distributed time: {update_time:.3f}s" + f"[sgl] fully update model_name {model_name} rank {rank} parameter from distributed time: {update_time:.3f}s" ) param_queue.put((f"update_sgl_dp_{rank}_time", update_time))