From b9a54e0968970a5f8021650136175660ea7ee565 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 17 Oct 2025 14:25:22 -0700 Subject: [PATCH] [minor] sync code on python/sglang/test/test_deterministic.py and improve ci tests (#11777) Co-authored-by: Stefan He Co-authored-by: Byron Hsu --- .github/workflows/pr-test.yml | 12 +- .../benchmark_and_profiling.md | 6 + python/sglang/srt/managers/io_struct.py | 2 + python/sglang/srt/metrics/collector.py | 13 + python/sglang/srt/server_args.py | 5 + .../test/attention/test_flashattn_backend.py | 2 +- python/sglang/test/test_deterministic.py | 244 ++++++++++++++++-- test/srt/rl/test_fp32_lm_head.py | 1 - test/srt/run_suite.py | 18 +- 9 files changed, 264 insertions(+), 39 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 41d5d2a17..98b65bf59 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -319,7 +319,7 @@ jobs: cd test/srt python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 - unit-test-backend-8-gpu: + unit-test-backend-8-gpu-h200: needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels] if: always() && !failure() && !cancelled() && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) @@ -348,7 +348,7 @@ jobs: timeout-minutes: 20 run: | cd test/srt - python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 + python3 run_suite.py --suite per-commit-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 unit-test-backend-8-gpu-h20: needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels] @@ -695,7 +695,7 @@ jobs: timeout-minutes: 20 run: | cd test/srt - python3 run_suite.py --suite per-commit-8-gpu-deepep + python3 run_suite.py --suite per-commit-8-gpu-h200-deepep unit-test-backend-8-gpu-deepseek-v32: needs: [check-changes, 
unit-test-backend-2-gpu, sgl-kernel-build-wheels] @@ -722,7 +722,7 @@ jobs: timeout-minutes: 20 run: | cd test/srt - python3 run_suite.py --suite per-commit-8-gpu-deepseek-v32 + python3 run_suite.py --suite per-commit-8-gpu-h200-deepseek-v32 unit-test-backend-4-gpu-b200: needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels] @@ -761,12 +761,12 @@ jobs: sgl-kernel-unit-test, sgl-kernel-mla-test, sgl-kernel-benchmark-test, unit-test-frontend, unit-test-backend-1-gpu, - unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu, + unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu-h200, performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-1-gpu-part-3, performance-test-2-gpu, accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, unit-test-deepep-8-gpu, - # unit-test-backend-4-gpu-b200, + unit-test-backend-4-gpu-b200, ] if: always() runs-on: ubuntu-latest diff --git a/docs/developer_guide/benchmark_and_profiling.md b/docs/developer_guide/benchmark_and_profiling.md index dc5f884d4..458ca5fd0 100644 --- a/docs/developer_guide/benchmark_and_profiling.md +++ b/docs/developer_guide/benchmark_and_profiling.md @@ -116,6 +116,12 @@ python3 -m sglang.test.send_one python3 -m sglang.profiler ``` +You can also combine the above operations into a single command: + +``` +python3 -m sglang.test.send_one --profile +``` + ### Profiler Trace Merger for Distributed Traces SGLang now supports automatic merging of profiling traces from distributed setups with multiple parallelism types (TP, DP, PP, EP). This feature is particularly useful for analyzing performance across distributed runs.
diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 5a7e5ec6d..4aa07411b 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -879,6 +879,8 @@ class BatchMultimodalDecodeReq(BaseBatchReq): placeholder_tokens_idx: List[Optional[List[int]]] placeholder_tokens_val: List[Optional[List[int]]] + return_bytes: List[bool] + # The trainer step id. Used to know which step's weights are used for sampling. token_steps: List[List[int]] = None diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py index bd3204079..42002d36e 100644 --- a/python/sglang/srt/metrics/collector.py +++ b/python/sglang/srt/metrics/collector.py @@ -150,6 +150,9 @@ class SchedulerStats: engine_startup_time: float = 0.0 engine_load_weights_time: float = 0.0 + # CUDA graph + is_cuda_graph: float = 0.0 + class SchedulerMetricsCollector: @@ -499,6 +502,13 @@ class SchedulerMetricsCollector: labelnames=list(labels.keys()) + ["stage"], ) + self.is_cuda_graph = Gauge( + name="sglang:is_cuda_graph", + documentation="Whether the batch is using CUDA graph.", + labelnames=labels.keys(), + multiprocess_mode="mostrecent", + ) + def _log_gauge(self, gauge, data: Union[int, float]) -> None: # Convenience function for logging to gauge. gauge.labels(**self.labels).set(data) @@ -574,6 +584,9 @@ class SchedulerMetricsCollector: self.engine_load_weights_time, stats.engine_load_weights_time ) + # CUDA graph + self._log_gauge(self.is_cuda_graph, stats.is_cuda_graph) + self.last_log_time = time.perf_counter() def log_grammar_stats(self, grammar_stats) -> None: diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index dfb341128..9de754d35 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -509,6 +509,11 @@ class ServerArgs: """ Orchestrates the handling of various server arguments, ensuring proper configuration and validation. 
""" + + if self.model_path.lower() in ["none", "dummy"]: + # Skip for dummy models + return + # Handle deprecated arguments. self._handle_deprecated_args() diff --git a/python/sglang/test/attention/test_flashattn_backend.py b/python/sglang/test/attention/test_flashattn_backend.py index 5e5ebbaf1..719b4d1b0 100644 --- a/python/sglang/test/attention/test_flashattn_backend.py +++ b/python/sglang/test/attention/test_flashattn_backend.py @@ -66,7 +66,7 @@ class MockModelRunner: enable_memory_saver=False, ) # Required by torch native backend - self.server_args = ServerArgs(model_path="fake_model_path") + self.server_args = ServerArgs(model_path="dummy") @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA") diff --git a/python/sglang/test/test_deterministic.py b/python/sglang/test/test_deterministic.py index 1175a35e5..c889280a5 100644 --- a/python/sglang/test/test_deterministic.py +++ b/python/sglang/test/test_deterministic.py @@ -2,7 +2,17 @@ Batch the same prompt in random batch sizes, and test if the results are consistent across different trials. 
Usage: -python3 -m sglang.test.test_deterministic --n-trials --test-mode --profile +# Single mode: test determinism with varying batch sizes +python3 -m sglang.test.test_deterministic --n-trials 50 --test-mode single + +# Mixed mode: test with mixed prompts +python3 -m sglang.test.test_deterministic --n-trials 50 --test-mode mixed + +# Prefix mode: test with shared prefixes +python3 -m sglang.test.test_deterministic --n-start 1 --n-trials 50 --test-mode prefix + +# Radix Cache Consistency mode: test radix cache determinism (cached vs uncached prefill) +python3 -m sglang.test.test_deterministic --test-mode radix_cache """ import argparse @@ -67,7 +77,12 @@ class BenchArgs: "--test-mode", type=str, default=BenchArgs.test_mode, - choices=["single", "mixed", "prefix"], + choices=[ + "single", + "mixed", + "prefix", + "radix_cache", + ], ) parser.add_argument("--profile", action="store_true") parser.add_argument( @@ -83,26 +98,50 @@ class BenchArgs: def send_single( args, - batch_size: int, + batch_size: int = 1, profile: bool = False, profile_steps: int = 3, profile_by_stage: bool = False, + return_full_response: bool = False, + input_ids: List[int] = None, + max_new_tokens: int = None, ): - base_url = f"http://{args.host}:{args.port}" - prompt = [PROMPT_1] * batch_size - json_data = { - "text": prompt, - "sampling_params": { - "temperature": args.temperature, - "max_new_tokens": args.max_new_tokens, - "frequency_penalty": args.frequency_penalty, - "presence_penalty": args.presence_penalty, - }, - "return_logprob": args.return_logprob, - "stream": args.stream, - } + # Use input_ids if provided, otherwise use text prompts + if input_ids is not None: + json_data = { + "input_ids": input_ids, + "sampling_params": { + "temperature": args.temperature, + "max_new_tokens": ( + max_new_tokens + if max_new_tokens is not None + else args.max_new_tokens + ), + "frequency_penalty": args.frequency_penalty, + "presence_penalty": args.presence_penalty, + }, + "return_logprob": 
args.return_logprob, + "stream": args.stream, + } + else: + prompt = [PROMPT_1] * batch_size + json_data = { + "text": prompt, + "sampling_params": { + "temperature": args.temperature, + "max_new_tokens": ( + max_new_tokens + if max_new_tokens is not None + else args.max_new_tokens + ), + "frequency_penalty": args.frequency_penalty, + "presence_penalty": args.presence_penalty, + }, + "return_logprob": args.return_logprob, + "stream": args.stream, + } if args.sampling_seed is not None: # sglang server cannot parse None value for sampling_seed @@ -119,6 +158,11 @@ def send_single( stream=args.stream, ) + if response.status_code != 200: + ret = response.json() + print(f"Error: {ret}") + return None + if args.stream: for chunk in response.iter_lines(decode_unicode=False): chunk = chunk.decode("utf-8") @@ -128,13 +172,13 @@ def send_single( ret = json.loads(chunk[5:].strip("\n")) else: ret = response.json() - ret = ret[0] - if response.status_code != 200: - print(ret) - return -1 + ret = ret[0] if isinstance(ret, list) else ret - return ret["text"] + if return_full_response: + return ret + else: + return ret["text"] def send_mixed(args, batch_size: int): @@ -235,7 +279,6 @@ def test_deterministic(args): text = text.replace("\n", " ") print(f"Trial {i} with batch size {batch_size}: {text}") texts.append(text) - print(f"Total samples: {len(texts)}, Unique samples: {len(set(texts))}") return [len(set(texts))] @@ -297,6 +340,163 @@ def test_deterministic(args): results.append(len(set(outputs[i]))) return results + elif args.test_mode == "radix_cache": + # Radix mode requires logprobs to compare results + args.return_logprob = True + + print("\n=== Prefill Cache Consistency Test ===") + print( + "This test verifies prefill request produces consistent logprobs w/ and w/o cache.\n" + ) + + # We noticed that we cannot call flush cache before any request, otherwise it will hang. 
+ warmup_response = send_single( + args, input_ids=[1] * 64, max_new_tokens=65, return_full_response=True + ) + + # Flush cache first to make sure there is no cache hit from previous tests + flush_response = requests.post(f"http://{args.host}:{args.port}/flush_cache") + + print(f"Step 1: Generating random 64 token IDs...") + # Use a reasonable token ID range (100-50000 here, which fits most tokenizers) + # Avoid special tokens like 0 (padding), 1 (BOS), 2 (EOS) + # Seed the RNG so the random token IDs are reproducible across runs + random.seed(42) + initial_token_ids = [random.randint(100, 50000) for _ in range(64)] + + print(f"✓ Using {len(initial_token_ids)} initial tokens") + print(f" Initial token IDs: {initial_token_ids}") + + print( + f"\nStep 2: Generating 2 tokens from {len(initial_token_ids)} token prefix..." + ) + first_response = send_single( + args, + input_ids=initial_token_ids, + max_new_tokens=100, + return_full_response=True, + ) + first_output_text = first_response["text"] + first_output_token_ids = first_response["output_ids"] + first_output_logprobs = first_response["meta_info"]["output_token_logprobs"] + + expected_token_id = first_output_token_ids[-1] + expected_logprob = first_output_logprobs[-1][0] + + print(f"✓ Generated {len(first_output_token_ids)} tokens") + print(f' Output text: "{first_output_text}"') + + print( + f"\nStep 3: Generating with radix cache (164 tokens prefill, should hit > 128 tokens cache, based on page size)..."
+ ) + prefix_token_ids = initial_token_ids + first_output_token_ids[:-1] + print( + f" Prefix: {len(initial_token_ids)} initial + 64 generated = {len(prefix_token_ids)} tokens" + ) + print(f"Using Prompt: {prefix_token_ids}") + cached_response = send_single( + args, + input_ids=prefix_token_ids, + max_new_tokens=1, + return_full_response=True, + ) + cached_logprobs = cached_response["meta_info"]["output_token_logprobs"] + cached_token_data = cached_logprobs[0] + cached_logprob = cached_token_data[0] + cached_token_id = cached_token_data[1] + + print(f"✓ Generated with cache:") + print(f" Token ID: {cached_token_id}") + print(f" Logprob: {cached_logprob:.10f}") + + print(f"\nStep 4: Flushing cache...") + flush_response = requests.post(f"http://{args.host}:{args.port}/flush_cache") + + print( + f"\nStep 5: Generating without cache (same 164 tokens prefill, no cache)..." + ) + print(f"Using Prompt: {prefix_token_ids}") + + uncached_response = send_single( + args, + input_ids=prefix_token_ids, + max_new_tokens=1, + return_full_response=True, + ) + + uncached_logprobs = uncached_response["meta_info"]["output_token_logprobs"] + uncached_token_data = uncached_logprobs[0] + uncached_logprob = uncached_token_data[0] + uncached_token_id = uncached_token_data[1] + + print(f"✓ Generated without cache:") + print(f" Token ID: {uncached_token_id}") + print(f" Logprob: {uncached_logprob:.10f}") + + # Step 6: Compare results + print(f"\n{'='*60}") + print("Comparison 1: Decode (Request 1) vs Prefill with Cache (Request 2)") + print("=" * 60) + + # Compare first request (decode) vs second request (prefill with cache) + # We expect them to be different (different kernels) + decode_vs_prefill_token_match = expected_token_id == cached_token_id + decode_vs_prefill_logprob_match = expected_logprob == cached_logprob + + print( + f" Decode token (Request 1): ID={expected_token_id}, logprob={expected_logprob:.10f}" + ) + print( + f" Prefill w/ cache token (Request 2): ID={cached_token_id}, 
logprob={cached_logprob:.10f}" + ) + print( + f" Token ID match: {'✓ YES' if decode_vs_prefill_token_match else '✗ NO'}" + ) + print( + f" Logprob match: {'✓ YES' if decode_vs_prefill_logprob_match else '✗ NO'}" + ) + if not decode_vs_prefill_logprob_match: + diff = abs(expected_logprob - cached_logprob) + print(f" Logprob difference: {diff:.10e}") + print(f" Note: We expect these to be DIFFERENT (decode vs prefill kernels)") + + print(f"\n{'='*60}") + print( + "Comparison 2: Cached Prefill (Request 2) vs Uncached Prefill (Request 3)" + ) + print("=" * 60) + + # Main test: compare cached vs uncached prefill (should be identical) + token_match = cached_token_id == uncached_token_id + logprob_match = cached_logprob == uncached_logprob + + print( + f" Cached prefill token (Request 2): ID={cached_token_id}, logprob={cached_logprob:.10f}" + ) + print( + f" Uncached prefill token (Request 3): ID={uncached_token_id}, logprob={uncached_logprob:.10f}" + ) + print(f" Token ID match: {'✓ YES' if token_match else '✗ NO'}") + if not token_match: + print(f" Cached: {cached_token_id}") + print(f" Uncached: {uncached_token_id}") + + print(f" Logprob match: {'✓ YES' if logprob_match else '✗ NO'}") + if not logprob_match: + print(f" Cached: {cached_logprob:.10f}") + print(f" Uncached: {uncached_logprob:.10f}") + diff = abs(cached_logprob - uncached_logprob) + print(f" Difference: {diff:.10e}") + print(f" Note: We expect these to be IDENTICAL (both prefill kernels)") + + print(f"\n{'='*60}") + if token_match and logprob_match: + print("✓✓✓ TEST PASSED - Radix cache is consistent! ✓✓✓") + return [1] + else: + print("✗✗✗ TEST FAILED - Radix cache produces different results! 
✗✗✗") + return [0] + else: raise ValueError(f"Invalid test mode: {args.test_mode}") diff --git a/test/srt/rl/test_fp32_lm_head.py b/test/srt/rl/test_fp32_lm_head.py index dea43995b..cf6dd2839 100644 --- a/test/srt/rl/test_fp32_lm_head.py +++ b/test/srt/rl/test_fp32_lm_head.py @@ -36,7 +36,6 @@ class TestLMHeadFP32(unittest.TestCase): raise unittest.SkipTest("needs CUDA GPU") def _make_logprocessor(self, vocab_size, enable_fp32): - ServerArgs.__post_init__ = lambda self: None # disable validation set_global_server_args_for_scheduler(ServerArgs(model_path="dummy")) get_global_server_args().enable_dp_lm_head = False get_global_server_args().enable_fp32_lm_head = enable_fp32 diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 7d1a12eef..d046f35d0 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -66,10 +66,10 @@ suites = { TestFile("rl/test_update_weights_from_disk.py", 114), TestFile("rl/test_update_weights_from_tensor.py", 48), TestFile("test_abort.py", 51), + TestFile("test_build_eagle_tree.py", 8), TestFile("test_chunked_prefill.py", 313), TestFile("test_create_kvindices.py", 2), TestFile("test_deterministic.py", 300), - TestFile("test_build_eagle_tree.py", 8), TestFile("test_eagle_infer_a.py", 370), TestFile("test_eagle_infer_b.py", 700), TestFile("test_eagle_infer_beta.py", 300), @@ -158,12 +158,17 @@ suites = { TestFile("test_multi_instance_release_memory_occupation.py", 64), TestFile("test_pp_single_node.py", 481), ], - "per-commit-8-gpu": [ + "per-commit-8-gpu-h200": [ TestFile("lora/test_lora_llama4.py", 400), TestFile("test_deepseek_v3_basic.py", 275), TestFile("test_deepseek_v3_mtp.py", 275), TestFile("test_disaggregation_hybrid_attention.py", 200), ], + "per-commit-8-gpu-h20": [ + TestFile("quant/test_w4a8_deepseek_v3.py", 371), + TestFile("test_disaggregation_different_tp.py", 600), + TestFile("test_disaggregation_pp.py", 140), + ], "per-commit-4-gpu-b200": [ # TestFile("test_gpt_oss_4gpu.py", 600), # 
TestFile("test_deepseek_v3_fp4_4gpu.py", 3600), @@ -172,17 +177,12 @@ suites = { TestFile("ep/test_deepep_small.py", 531), TestFile("ep/test_mooncake_ep_small.py", 450), ], - "per-commit-8-gpu-deepep": [ + "per-commit-8-gpu-h200-deepep": [ TestFile("ep/test_deepep_large.py", 338), ], - "per-commit-8-gpu-deepseek-v32": [ + "per-commit-8-gpu-h200-deepseek-v32": [ TestFile("test_deepseek_v32_basic.py", 275), ], - "per-commit-8-gpu-h20": [ - TestFile("test_disaggregation_different_tp.py", 600), - TestFile("test_disaggregation_pp.py", 140), - TestFile("quant/test_w4a8_deepseek_v3.py", 371), - ], "vllm_dependency_test": [ TestFile("quant/test_awq.py", 163), TestFile("test_bnb.py", 5),