diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 5ac065973..3b33b319d 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -95,7 +95,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        range: [0-6, 6-15, 15-22, 22-32, 32-40, 40-48, 48-100]
+        part: [0, 1, 2, 3, 4, 5, 6]
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
@@ -109,11 +109,8 @@ jobs:
       - name: Run test
         timeout-minutes: 30
         run: |
-          RANGE=${{ matrix.range }}
-          range_begin=${RANGE%-*}
-          range_end=${RANGE#*-}
           cd test/srt
-          python3 run_suite.py --suite per-commit --range-begin ${range_begin} --range-end ${range_end}
+          python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 7

   unit-test-backend-2-gpu:
     needs: filter
@@ -340,7 +337,6 @@ jobs:
           python3 test_moe_eval_accuracy_large.py

   finish:
-    if: always()
     needs: [
       unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu,
       performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index a8bf674a9..e2d92bb51 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -446,22 +446,31 @@ def run_with_timeout(
     return ret_value[0]


-def run_unittest_files(files: List[str], timeout_per_file: float):
+def run_unittest_files(files: List, timeout_per_file: float):
     tic = time.time()
     success = True

-    for filename in files:
+    for file in files:
+        filename, estimated_time = file.name, file.estimated_time
         process = None

         def run_one_file(filename):
             nonlocal process

             filename = os.path.join(os.getcwd(), filename)
-            print(f"\n\nRun:\npython3 {filename}\n\n", flush=True)
+            print(f".\n.\nBegin:\npython3 {filename}\n.\n.\n", flush=True)
+            tic = time.time()
+
             process = subprocess.Popen(
                 ["python3", filename], stdout=None, stderr=None, env=os.environ
             )
             process.wait()
+            elapsed = time.time() - tic
+
+            print(
+                f".\n.\nEnd:\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
+                flush=True,
+            )
             return process.returncode

         try:
diff --git a/test/lang/run_suite.py b/test/lang/run_suite.py
index 327d18b3f..e4e11ca4f 100644
--- a/test/lang/run_suite.py
+++ b/test/lang/run_suite.py
@@ -1,11 +1,19 @@
 import argparse
 import glob
+from dataclasses import dataclass

 from sglang.test.test_utils import run_unittest_files

+
+@dataclass
+class TestFile:
+    name: str
+    estimated_time: float = 60
+
+
 suites = {
     "per-commit": [
-        "test_srt_backend.py",
+        TestFile("test_srt_backend.py"),
         # Skip this due to some OPENAI_API_KEY issues
         # "test_openai_backend.py",
     ],
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index ebab2bf68..b9e36e232 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -1,85 +1,125 @@
 import argparse
 import glob
+from dataclasses import dataclass

 from sglang.test.test_utils import run_unittest_files

+
+@dataclass
+class TestFile:
+    name: str
+    estimated_time: float = 60
+
+
 suites = {
     "per-commit": [
-        "models/lora/test_lora.py",
-        "models/lora/test_lora_backend.py",
-        "models/lora/test_multi_lora_backend.py",
-        "models/test_embedding_models.py",
-        "models/test_generation_models.py",
-        "models/test_qwen_models.py",
-        "models/test_reward_models.py",
-        "test_gptqmodel_dynamic.py",
-        "models/test_gme_qwen_models.py",
-        "test_abort.py",
-        "test_chunked_prefill.py",
-        "test_custom_allreduce.py",
-        "test_double_sparsity.py",
-        "test_eagle_infer.py",
-        "test_embedding_openai_server.py",
-        "test_eval_accuracy_mini.py",
-        "test_gguf.py",
-        "test_input_embeddings.py",
-        "test_mla.py",
-        "test_mla_deepseek_v3.py",
-        "test_mla_flashinfer.py",
-        "test_mla_fp8.py",
-        "test_json_constrained.py",
-        "test_large_max_new_tokens.py",
-        "test_metrics.py",
-        "test_no_chunked_prefill.py",
-        "test_no_overlap_scheduler.py",
-        "test_openai_server.py",
-        "test_penalty.py",
-        "test_pytorch_sampling_backend.py",
-        "test_radix_attention.py",
-        "test_regex_constrained.py",
-        "test_release_memory_occupation.py",
-        "test_request_length_validation.py",
-        "test_retract_decode.py",
-        "test_server_args.py",
-        # Disabled temporarily
-        # "test_session_control.py",
-        "test_skip_tokenizer_init.py",
-        "test_srt_engine.py",
-        "test_srt_endpoint.py",
-        "test_torch_compile.py",
-        "test_torch_compile_moe.py",
-        "test_torch_native_attention_backend.py",
-        "test_torchao.py",
-        "test_triton_attention_kernels.py",
-        "test_triton_attention_backend.py",
-        "test_hidden_states.py",
-        "test_update_weights_from_disk.py",
-        "test_update_weights_from_tensor.py",
-        "test_vertex_endpoint.py",
-        "test_vision_chunked_prefill.py",
-        "test_vision_llm.py",
-        "test_vision_openai_server.py",
-        "test_w8a8_quantization.py",
-        "test_fp8_kernel.py",
-        "test_block_int8.py",
-        "test_int8_kernel.py",
-        "test_reasoning_content.py",
+        TestFile("models/lora/test_lora.py", 76),
+        TestFile("models/lora/test_lora_backend.py", 420),
+        TestFile("models/lora/test_multi_lora_backend.py", 1),
+        TestFile("models/test_embedding_models.py", 119),
+        TestFile("models/test_generation_models.py", 103),
+        TestFile("models/test_qwen_models.py", 82),
+        TestFile("models/test_reward_models.py", 83),
+        TestFile("test_gptqmodel_dynamic.py", 72),
+        TestFile("models/test_gme_qwen_models.py", 45),
+        TestFile("test_abort.py", 51),
+        TestFile("test_chunked_prefill.py", 336),
+        TestFile("test_custom_allreduce.py", 1),
+        TestFile("test_double_sparsity.py", 50),
+        TestFile("test_eagle_infer.py", 447),
+        TestFile("test_embedding_openai_server.py", 36),
+        TestFile("test_eval_accuracy_mini.py", 63),
+        TestFile("test_gguf.py", 78),
+        TestFile("test_input_embeddings.py", 38),
+        TestFile("test_mla.py", 92),
+        TestFile("test_mla_deepseek_v3.py", 221),
+        TestFile("test_mla_flashinfer.py", 395),
+        TestFile("test_mla_fp8.py", 93),
+        TestFile("test_json_constrained.py", 98),
+        TestFile("test_large_max_new_tokens.py", 41),
+        TestFile("test_metrics.py", 32),
+        TestFile("test_no_chunked_prefill.py", 126),
+        TestFile("test_no_overlap_scheduler.py", 262),
+        TestFile("test_openai_server.py", 124),
+        TestFile("test_penalty.py", 41),
+        TestFile("test_pytorch_sampling_backend.py", 66),
+        TestFile("test_radix_attention.py", 167),
+        TestFile("test_regex_constrained.py", 64),
+        TestFile("test_release_memory_occupation.py", 44),
+        TestFile("test_request_length_validation.py", 31),
+        TestFile("test_retract_decode.py", 54),
+        TestFile("test_server_args.py", 1),
+        TestFile("test_skip_tokenizer_init.py", 72),
+        TestFile("test_srt_engine.py", 237),
+        TestFile("test_srt_endpoint.py", 94),
+        TestFile("test_torch_compile.py", 76),
+        TestFile("test_torch_compile_moe.py", 85),
+        TestFile("test_torch_native_attention_backend.py", 149),
+        TestFile("test_torchao.py", 70),
+        TestFile("test_triton_attention_kernels.py", 4),
+        TestFile("test_triton_attention_backend.py", 134),
+        TestFile("test_hidden_states.py", 55),
+        TestFile("test_update_weights_from_disk.py", 114),
+        TestFile("test_update_weights_from_tensor.py", 48),
+        TestFile("test_vertex_endpoint.py", 31),
+        TestFile("test_vision_chunked_prefill.py", 223),
+        TestFile("test_vision_llm.py", 18.4),
+        TestFile("test_vision_openai_server.py", 344),
+        TestFile("test_w8a8_quantization.py", 46),
+        TestFile("test_fp8_kernel.py", 2),
+        TestFile("test_block_int8.py", 22),
+        TestFile("test_int8_kernel.py", 1),
+        TestFile("test_reasoning_content.py", 89),
     ],
     "nightly": [
-        "test_nightly_gsm8k_eval.py",
-        # Disable temporarily
-        # "test_nightly_math_eval.py",
+        TestFile("test_nightly_gsm8k_eval.py"),
     ],
 }

-# Expand suite
-for target_suite_name, target_tests in suites.items():
-    for suite_name, tests in suites.items():
-        if suite_name == target_suite_name:
-            continue
-        if target_suite_name in tests:
-            tests.remove(target_suite_name)
-            tests.extend(target_tests)
+
+def auto_partition(files, rank, size):
+    """
+    Partition files into size sublists with approximately equal sums of estimated times
+    using stable sorting, and return the partition for the specified rank.
+
+    Args:
+        files (list): List of file objects with estimated_time attribute
+        rank (int): Index of the partition to return (0 to size-1)
+        size (int): Number of partitions
+
+    Returns:
+        list: List of file objects in the specified rank's partition
+    """
+    weights = [f.estimated_time for f in files]
+
+    if not weights or size <= 0 or size > len(weights):
+        return []
+
+    # Create list of (weight, original_index) tuples
+    # Using negative index as secondary key to maintain original order for equal weights
+    indexed_weights = [(w, -i) for i, w in enumerate(weights)]
+    # Stable sort in descending order by weight
+    # If weights are equal, larger (negative) index comes first (i.e., earlier original position)
+    indexed_weights = sorted(indexed_weights, reverse=True)
+
+    # Extract original indices (negate back to positive)
+    indexed_weights = [(w, -i) for w, i in indexed_weights]
+
+    # Initialize partitions and their sums
+    partitions = [[] for _ in range(size)]
+    sums = [0.0] * size
+
+    # Greedy approach: assign each weight to partition with smallest current sum
+    for weight, idx in indexed_weights:
+        # Find partition with minimum sum
+        min_sum_idx = sums.index(min(sums))
+        partitions[min_sum_idx].append(idx)
+        sums[min_sum_idx] += weight
+
+    # Return the files corresponding to the indices in the specified rank's partition
+    indices = partitions[rank]
+    return [files[i] for i in indices]
+

 if __name__ == "__main__":
     arg_parser = argparse.ArgumentParser()
@@ -108,17 +148,30 @@ if __name__ == "__main__":
         default=None,
         help="The end index of the range of the files to run.",
     )
+    arg_parser.add_argument(
+        "--auto-partition-id",
+        type=int,
+        help="Use auto load balancing. The part id.",
+    )
+    arg_parser.add_argument(
+        "--auto-partition-size",
+        type=int,
+        help="Use auto load balancing. The number of parts.",
+    )
     args = arg_parser.parse_args()
+    print(f"{args=}")

     if args.suite == "all":
         files = glob.glob("**/test_*.py", recursive=True)
     else:
         files = suites[args.suite]

-    files = files[args.range_begin : args.range_end]
+    if args.auto_partition_size:
+        files = auto_partition(files, args.auto_partition_id, args.auto_partition_size)
+    else:
+        files = files[args.range_begin : args.range_end]

-    print(f"{args=}")
-    print("The running tests are ", files)
+    print("The running tests are ", [f.name for f in files])

     exit_code = run_unittest_files(files, args.timeout_per_file)
     exit(exit_code)
diff --git a/test/srt/test_custom_allreduce.py b/test/srt/test_custom_allreduce.py
index 5f6f5d9b4..7ac0c8ffc 100644
--- a/test/srt/test_custom_allreduce.py
+++ b/test/srt/test_custom_allreduce.py
@@ -42,7 +42,8 @@ def multi_process_parallel(
     # as compared to multiprocessing.
    # NOTE: We need to set working_dir for distributed tests,
    # otherwise we may get import errors on ray workers
-    ray.init(log_to_driver=False)
+
+    ray.init(log_to_driver=True)
    distributed_init_port = get_open_port()
    refs = []