[minor] sync code on python/sglang/test/test_deterministic.py and improve ci tests (#11777)
Co-authored-by: Stefan He <hebiaobuaa@gmail.com> Co-authored-by: Byron Hsu <byronhsu1230@gmail.com>
This commit is contained in:
12
.github/workflows/pr-test.yml
vendored
12
.github/workflows/pr-test.yml
vendored
@@ -319,7 +319,7 @@ jobs:
|
|||||||
cd test/srt
|
cd test/srt
|
||||||
python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
|
python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
|
||||||
|
|
||||||
unit-test-backend-8-gpu:
|
unit-test-backend-8-gpu-h200:
|
||||||
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
|
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
|
||||||
if: always() && !failure() && !cancelled() &&
|
if: always() && !failure() && !cancelled() &&
|
||||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||||
@@ -348,7 +348,7 @@ jobs:
|
|||||||
timeout-minutes: 20
|
timeout-minutes: 20
|
||||||
run: |
|
run: |
|
||||||
cd test/srt
|
cd test/srt
|
||||||
python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
|
python3 run_suite.py --suite per-commit-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
|
||||||
|
|
||||||
unit-test-backend-8-gpu-h20:
|
unit-test-backend-8-gpu-h20:
|
||||||
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
|
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
|
||||||
@@ -695,7 +695,7 @@ jobs:
|
|||||||
timeout-minutes: 20
|
timeout-minutes: 20
|
||||||
run: |
|
run: |
|
||||||
cd test/srt
|
cd test/srt
|
||||||
python3 run_suite.py --suite per-commit-8-gpu-deepep
|
python3 run_suite.py --suite per-commit-8-gpu-h200-deepep
|
||||||
|
|
||||||
unit-test-backend-8-gpu-deepseek-v32:
|
unit-test-backend-8-gpu-deepseek-v32:
|
||||||
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
|
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
|
||||||
@@ -722,7 +722,7 @@ jobs:
|
|||||||
timeout-minutes: 20
|
timeout-minutes: 20
|
||||||
run: |
|
run: |
|
||||||
cd test/srt
|
cd test/srt
|
||||||
python3 run_suite.py --suite per-commit-8-gpu-deepseek-v32
|
python3 run_suite.py --suite per-commit-8-gpu-h200-deepseek-v32
|
||||||
|
|
||||||
unit-test-backend-4-gpu-b200:
|
unit-test-backend-4-gpu-b200:
|
||||||
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
|
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
|
||||||
@@ -761,12 +761,12 @@ jobs:
|
|||||||
sgl-kernel-unit-test, sgl-kernel-mla-test, sgl-kernel-benchmark-test,
|
sgl-kernel-unit-test, sgl-kernel-mla-test, sgl-kernel-benchmark-test,
|
||||||
|
|
||||||
unit-test-frontend, unit-test-backend-1-gpu,
|
unit-test-frontend, unit-test-backend-1-gpu,
|
||||||
unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu,
|
unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu-h200,
|
||||||
performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-1-gpu-part-3,
|
performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-1-gpu-part-3,
|
||||||
performance-test-2-gpu,
|
performance-test-2-gpu,
|
||||||
accuracy-test-1-gpu, accuracy-test-2-gpu,
|
accuracy-test-1-gpu, accuracy-test-2-gpu,
|
||||||
unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
|
unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
|
||||||
# unit-test-backend-4-gpu-b200,
|
unit-test-backend-4-gpu-b200,
|
||||||
]
|
]
|
||||||
if: always()
|
if: always()
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|||||||
@@ -116,6 +116,12 @@ python3 -m sglang.test.send_one
|
|||||||
python3 -m sglang.profiler
|
python3 -m sglang.profiler
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can also combine the above operations into a single command:
|
||||||
|
|
||||||
|
```
|
||||||
|
python3 -m sglang.test.send_one --profile
|
||||||
|
```
|
||||||
|
|
||||||
### Profiler Trace Merger for Distributed Traces
|
### Profiler Trace Merger for Distributed Traces
|
||||||
|
|
||||||
SGLang now supports automatic merging of profiling traces from distributed setups with multiple parallelism types (TP, DP, PP, EP). This feature is particularly useful for analyzing performance across distributed runs.
|
SGLang now supports automatic merging of profiling traces from distributed setups with multiple parallelism types (TP, DP, PP, EP). This feature is particularly useful for analyzing performance across distributed runs.
|
||||||
|
|||||||
@@ -879,6 +879,8 @@ class BatchMultimodalDecodeReq(BaseBatchReq):
|
|||||||
placeholder_tokens_idx: List[Optional[List[int]]]
|
placeholder_tokens_idx: List[Optional[List[int]]]
|
||||||
placeholder_tokens_val: List[Optional[List[int]]]
|
placeholder_tokens_val: List[Optional[List[int]]]
|
||||||
|
|
||||||
|
return_bytes: List[bool]
|
||||||
|
|
||||||
# The trainer step id. Used to know which step's weights are used for sampling.
|
# The trainer step id. Used to know which step's weights are used for sampling.
|
||||||
token_steps: List[List[int]] = None
|
token_steps: List[List[int]] = None
|
||||||
|
|
||||||
|
|||||||
@@ -150,6 +150,9 @@ class SchedulerStats:
|
|||||||
engine_startup_time: float = 0.0
|
engine_startup_time: float = 0.0
|
||||||
engine_load_weights_time: float = 0.0
|
engine_load_weights_time: float = 0.0
|
||||||
|
|
||||||
|
# CUDA graph
|
||||||
|
is_cuda_graph: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
class SchedulerMetricsCollector:
|
class SchedulerMetricsCollector:
|
||||||
|
|
||||||
@@ -499,6 +502,13 @@ class SchedulerMetricsCollector:
|
|||||||
labelnames=list(labels.keys()) + ["stage"],
|
labelnames=list(labels.keys()) + ["stage"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.is_cuda_graph = Gauge(
|
||||||
|
name="sglang:is_cuda_graph",
|
||||||
|
documentation="Whether the batch is using CUDA graph.",
|
||||||
|
labelnames=labels.keys(),
|
||||||
|
multiprocess_mode="mostrecent",
|
||||||
|
)
|
||||||
|
|
||||||
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
|
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
|
||||||
# Convenience function for logging to gauge.
|
# Convenience function for logging to gauge.
|
||||||
gauge.labels(**self.labels).set(data)
|
gauge.labels(**self.labels).set(data)
|
||||||
@@ -574,6 +584,9 @@ class SchedulerMetricsCollector:
|
|||||||
self.engine_load_weights_time, stats.engine_load_weights_time
|
self.engine_load_weights_time, stats.engine_load_weights_time
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# CUDA graph
|
||||||
|
self._log_gauge(self.is_cuda_graph, stats.is_cuda_graph)
|
||||||
|
|
||||||
self.last_log_time = time.perf_counter()
|
self.last_log_time = time.perf_counter()
|
||||||
|
|
||||||
def log_grammar_stats(self, grammar_stats) -> None:
|
def log_grammar_stats(self, grammar_stats) -> None:
|
||||||
|
|||||||
@@ -509,6 +509,11 @@ class ServerArgs:
|
|||||||
"""
|
"""
|
||||||
Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
|
Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if self.model_path.lower() in ["none", "dummy"]:
|
||||||
|
# Skip for dummy models
|
||||||
|
return
|
||||||
|
|
||||||
# Handle deprecated arguments.
|
# Handle deprecated arguments.
|
||||||
self._handle_deprecated_args()
|
self._handle_deprecated_args()
|
||||||
|
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class MockModelRunner:
|
|||||||
enable_memory_saver=False,
|
enable_memory_saver=False,
|
||||||
)
|
)
|
||||||
# Required by torch native backend
|
# Required by torch native backend
|
||||||
self.server_args = ServerArgs(model_path="fake_model_path")
|
self.server_args = ServerArgs(model_path="dummy")
|
||||||
|
|
||||||
|
|
||||||
@unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
|
@unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
|
||||||
|
|||||||
@@ -2,7 +2,17 @@
|
|||||||
Batch the same prompt in random batch sizes, and test if the results are consistent across different trials.
|
Batch the same prompt in random batch sizes, and test if the results are consistent across different trials.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python3 -m sglang.test.test_deterministic --n-trials <numer_of_trials> --test-mode <single|mixed|prefix> --profile
|
# Single mode: test determinism with varying batch sizes
|
||||||
|
python3 -m sglang.test.test_deterministic --n-trials 50 --test-mode single
|
||||||
|
|
||||||
|
# Mixed mode: test with mixed prompts
|
||||||
|
python3 -m sglang.test.test_deterministic --n-trials 50 --test-mode mixed
|
||||||
|
|
||||||
|
# Prefix mode: test with shared prefixes
|
||||||
|
python3 -m sglang.test.test_deterministic --n-start 1 --n-trials 50 --test-mode prefix
|
||||||
|
|
||||||
|
# Radix Cache Consistency mode: test radix cache determinism (cached vs uncached prefill)
|
||||||
|
python3 -m sglang.test.test_deterministic --test-mode radix_cache
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
@@ -67,7 +77,12 @@ class BenchArgs:
|
|||||||
"--test-mode",
|
"--test-mode",
|
||||||
type=str,
|
type=str,
|
||||||
default=BenchArgs.test_mode,
|
default=BenchArgs.test_mode,
|
||||||
choices=["single", "mixed", "prefix"],
|
choices=[
|
||||||
|
"single",
|
||||||
|
"mixed",
|
||||||
|
"prefix",
|
||||||
|
"radix_cache",
|
||||||
|
],
|
||||||
)
|
)
|
||||||
parser.add_argument("--profile", action="store_true")
|
parser.add_argument("--profile", action="store_true")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -83,26 +98,50 @@ class BenchArgs:
|
|||||||
|
|
||||||
def send_single(
|
def send_single(
|
||||||
args,
|
args,
|
||||||
batch_size: int,
|
batch_size: int = 1,
|
||||||
profile: bool = False,
|
profile: bool = False,
|
||||||
profile_steps: int = 3,
|
profile_steps: int = 3,
|
||||||
profile_by_stage: bool = False,
|
profile_by_stage: bool = False,
|
||||||
|
return_full_response: bool = False,
|
||||||
|
input_ids: List[int] = None,
|
||||||
|
max_new_tokens: int = None,
|
||||||
):
|
):
|
||||||
|
|
||||||
base_url = f"http://{args.host}:{args.port}"
|
base_url = f"http://{args.host}:{args.port}"
|
||||||
prompt = [PROMPT_1] * batch_size
|
|
||||||
|
|
||||||
json_data = {
|
# Use input_ids if provided, otherwise use text prompts
|
||||||
"text": prompt,
|
if input_ids is not None:
|
||||||
"sampling_params": {
|
json_data = {
|
||||||
"temperature": args.temperature,
|
"input_ids": input_ids,
|
||||||
"max_new_tokens": args.max_new_tokens,
|
"sampling_params": {
|
||||||
"frequency_penalty": args.frequency_penalty,
|
"temperature": args.temperature,
|
||||||
"presence_penalty": args.presence_penalty,
|
"max_new_tokens": (
|
||||||
},
|
max_new_tokens
|
||||||
"return_logprob": args.return_logprob,
|
if max_new_tokens is not None
|
||||||
"stream": args.stream,
|
else args.max_new_tokens
|
||||||
}
|
),
|
||||||
|
"frequency_penalty": args.frequency_penalty,
|
||||||
|
"presence_penalty": args.presence_penalty,
|
||||||
|
},
|
||||||
|
"return_logprob": args.return_logprob,
|
||||||
|
"stream": args.stream,
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
prompt = [PROMPT_1] * batch_size
|
||||||
|
json_data = {
|
||||||
|
"text": prompt,
|
||||||
|
"sampling_params": {
|
||||||
|
"temperature": args.temperature,
|
||||||
|
"max_new_tokens": (
|
||||||
|
max_new_tokens
|
||||||
|
if max_new_tokens is not None
|
||||||
|
else args.max_new_tokens
|
||||||
|
),
|
||||||
|
"frequency_penalty": args.frequency_penalty,
|
||||||
|
"presence_penalty": args.presence_penalty,
|
||||||
|
},
|
||||||
|
"return_logprob": args.return_logprob,
|
||||||
|
"stream": args.stream,
|
||||||
|
}
|
||||||
|
|
||||||
if args.sampling_seed is not None:
|
if args.sampling_seed is not None:
|
||||||
# sglang server cannot parse None value for sampling_seed
|
# sglang server cannot parse None value for sampling_seed
|
||||||
@@ -119,6 +158,11 @@ def send_single(
|
|||||||
stream=args.stream,
|
stream=args.stream,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
ret = response.json()
|
||||||
|
print(f"Error: {ret}")
|
||||||
|
return None
|
||||||
|
|
||||||
if args.stream:
|
if args.stream:
|
||||||
for chunk in response.iter_lines(decode_unicode=False):
|
for chunk in response.iter_lines(decode_unicode=False):
|
||||||
chunk = chunk.decode("utf-8")
|
chunk = chunk.decode("utf-8")
|
||||||
@@ -128,13 +172,13 @@ def send_single(
|
|||||||
ret = json.loads(chunk[5:].strip("\n"))
|
ret = json.loads(chunk[5:].strip("\n"))
|
||||||
else:
|
else:
|
||||||
ret = response.json()
|
ret = response.json()
|
||||||
ret = ret[0]
|
|
||||||
|
|
||||||
if response.status_code != 200:
|
ret = ret[0] if isinstance(ret, list) else ret
|
||||||
print(ret)
|
|
||||||
return -1
|
|
||||||
|
|
||||||
return ret["text"]
|
if return_full_response:
|
||||||
|
return ret
|
||||||
|
else:
|
||||||
|
return ret["text"]
|
||||||
|
|
||||||
|
|
||||||
def send_mixed(args, batch_size: int):
|
def send_mixed(args, batch_size: int):
|
||||||
@@ -235,7 +279,6 @@ def test_deterministic(args):
|
|||||||
text = text.replace("\n", " ")
|
text = text.replace("\n", " ")
|
||||||
print(f"Trial {i} with batch size {batch_size}: {text}")
|
print(f"Trial {i} with batch size {batch_size}: {text}")
|
||||||
texts.append(text)
|
texts.append(text)
|
||||||
|
|
||||||
print(f"Total samples: {len(texts)}, Unique samples: {len(set(texts))}")
|
print(f"Total samples: {len(texts)}, Unique samples: {len(set(texts))}")
|
||||||
return [len(set(texts))]
|
return [len(set(texts))]
|
||||||
|
|
||||||
@@ -297,6 +340,163 @@ def test_deterministic(args):
|
|||||||
results.append(len(set(outputs[i])))
|
results.append(len(set(outputs[i])))
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
elif args.test_mode == "radix_cache":
|
||||||
|
# Radix mode requires logprobs to compare results
|
||||||
|
args.return_logprob = True
|
||||||
|
|
||||||
|
print("\n=== Prefill Cache Consistency Test ===")
|
||||||
|
print(
|
||||||
|
"This test verifies prefill request produces consistent logprobs w/ and w/o cache.\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Send a warmup request first: we noticed that calling flush_cache before any request has been served makes the server hang.
|
||||||
|
warmup_response = send_single(
|
||||||
|
args, input_ids=[1] * 64, max_new_tokens=65, return_full_response=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Flush cache first to make sure there is no cache hit from previous tests
|
||||||
|
flush_response = requests.post(f"http://{args.host}:{args.port}/flush_cache")
|
||||||
|
|
||||||
|
print(f"Step 1: Generating random 64 token IDs...")
|
||||||
|
# Use a reasonable token ID range (100-50000 here, which should be in-vocabulary for most tokenizers)
|
||||||
|
# Avoid special tokens like 0 (padding), 1 (BOS), 2 (EOS)
|
||||||
|
# Seed the RNG so the same token IDs are generated on every run
|
||||||
|
random.seed(42)
|
||||||
|
initial_token_ids = [random.randint(100, 50000) for _ in range(64)]
|
||||||
|
|
||||||
|
print(f"✓ Using {len(initial_token_ids)} initial tokens")
|
||||||
|
print(f" Initial token IDs: {initial_token_ids}")
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"\nStep 2: Generating 2 tokens from {len(initial_token_ids)} token prefix..."
|
||||||
|
)
|
||||||
|
first_response = send_single(
|
||||||
|
args,
|
||||||
|
input_ids=initial_token_ids,
|
||||||
|
max_new_tokens=100,
|
||||||
|
return_full_response=True,
|
||||||
|
)
|
||||||
|
first_output_text = first_response["text"]
|
||||||
|
first_output_token_ids = first_response["output_ids"]
|
||||||
|
first_output_logprobs = first_response["meta_info"]["output_token_logprobs"]
|
||||||
|
|
||||||
|
expected_token_id = first_output_token_ids[-1]
|
||||||
|
expected_logprob = first_output_logprobs[-1][0]
|
||||||
|
|
||||||
|
print(f"✓ Generated {len(first_output_token_ids)} tokens")
|
||||||
|
print(f' Output text: "{first_output_text}"')
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"\nStep 3: Generating with radix cache (164 tokens prefill, should hit > 128 tokens cache, based on page size)..."
|
||||||
|
)
|
||||||
|
prefix_token_ids = initial_token_ids + first_output_token_ids[:-1]
|
||||||
|
print(
|
||||||
|
f" Prefix: {len(initial_token_ids)} initial + 64 generated = {len(prefix_token_ids)} tokens"
|
||||||
|
)
|
||||||
|
print(f"Using Prompt: {prefix_token_ids}")
|
||||||
|
cached_response = send_single(
|
||||||
|
args,
|
||||||
|
input_ids=prefix_token_ids,
|
||||||
|
max_new_tokens=1,
|
||||||
|
return_full_response=True,
|
||||||
|
)
|
||||||
|
cached_logprobs = cached_response["meta_info"]["output_token_logprobs"]
|
||||||
|
cached_token_data = cached_logprobs[0]
|
||||||
|
cached_logprob = cached_token_data[0]
|
||||||
|
cached_token_id = cached_token_data[1]
|
||||||
|
|
||||||
|
print(f"✓ Generated with cache:")
|
||||||
|
print(f" Token ID: {cached_token_id}")
|
||||||
|
print(f" Logprob: {cached_logprob:.10f}")
|
||||||
|
|
||||||
|
print(f"\nStep 4: Flushing cache...")
|
||||||
|
flush_response = requests.post(f"http://{args.host}:{args.port}/flush_cache")
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"\nStep 5: Generating without cache (same 164 tokens prefill, no cache)..."
|
||||||
|
)
|
||||||
|
print(f"Using Prompt: {prefix_token_ids}")
|
||||||
|
|
||||||
|
uncached_response = send_single(
|
||||||
|
args,
|
||||||
|
input_ids=prefix_token_ids,
|
||||||
|
max_new_tokens=1,
|
||||||
|
return_full_response=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
uncached_logprobs = uncached_response["meta_info"]["output_token_logprobs"]
|
||||||
|
uncached_token_data = uncached_logprobs[0]
|
||||||
|
uncached_logprob = uncached_token_data[0]
|
||||||
|
uncached_token_id = uncached_token_data[1]
|
||||||
|
|
||||||
|
print(f"✓ Generated without cache:")
|
||||||
|
print(f" Token ID: {uncached_token_id}")
|
||||||
|
print(f" Logprob: {uncached_logprob:.10f}")
|
||||||
|
|
||||||
|
# Step 6: Compare results
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("Comparison 1: Decode (Request 1) vs Prefill with Cache (Request 2)")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Compare first request (decode) vs second request (prefill with cache)
|
||||||
|
# We expect them to be different (different kernels)
|
||||||
|
decode_vs_prefill_token_match = expected_token_id == cached_token_id
|
||||||
|
decode_vs_prefill_logprob_match = expected_logprob == cached_logprob
|
||||||
|
|
||||||
|
print(
|
||||||
|
f" Decode token (Request 1): ID={expected_token_id}, logprob={expected_logprob:.10f}"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" Prefill w/ cache token (Request 2): ID={cached_token_id}, logprob={cached_logprob:.10f}"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" Token ID match: {'✓ YES' if decode_vs_prefill_token_match else '✗ NO'}"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" Logprob match: {'✓ YES' if decode_vs_prefill_logprob_match else '✗ NO'}"
|
||||||
|
)
|
||||||
|
if not decode_vs_prefill_logprob_match:
|
||||||
|
diff = abs(expected_logprob - cached_logprob)
|
||||||
|
print(f" Logprob difference: {diff:.10e}")
|
||||||
|
print(f" Note: We expect these to be DIFFERENT (decode vs prefill kernels)")
|
||||||
|
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(
|
||||||
|
"Comparison 2: Cached Prefill (Request 2) vs Uncached Prefill (Request 3)"
|
||||||
|
)
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Main test: compare cached vs uncached prefill (should be identical)
|
||||||
|
token_match = cached_token_id == uncached_token_id
|
||||||
|
logprob_match = cached_logprob == uncached_logprob
|
||||||
|
|
||||||
|
print(
|
||||||
|
f" Cached prefill token (Request 2): ID={cached_token_id}, logprob={cached_logprob:.10f}"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" Uncached prefill token (Request 3): ID={uncached_token_id}, logprob={uncached_logprob:.10f}"
|
||||||
|
)
|
||||||
|
print(f" Token ID match: {'✓ YES' if token_match else '✗ NO'}")
|
||||||
|
if not token_match:
|
||||||
|
print(f" Cached: {cached_token_id}")
|
||||||
|
print(f" Uncached: {uncached_token_id}")
|
||||||
|
|
||||||
|
print(f" Logprob match: {'✓ YES' if logprob_match else '✗ NO'}")
|
||||||
|
if not logprob_match:
|
||||||
|
print(f" Cached: {cached_logprob:.10f}")
|
||||||
|
print(f" Uncached: {uncached_logprob:.10f}")
|
||||||
|
diff = abs(cached_logprob - uncached_logprob)
|
||||||
|
print(f" Difference: {diff:.10e}")
|
||||||
|
print(f" Note: We expect these to be IDENTICAL (both prefill kernels)")
|
||||||
|
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
if token_match and logprob_match:
|
||||||
|
print("✓✓✓ TEST PASSED - Radix cache is consistent! ✓✓✓")
|
||||||
|
return [1]
|
||||||
|
else:
|
||||||
|
print("✗✗✗ TEST FAILED - Radix cache produces different results! ✗✗✗")
|
||||||
|
return [0]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid test mode: {args.test_mode}")
|
raise ValueError(f"Invalid test mode: {args.test_mode}")
|
||||||
|
|
||||||
|
|||||||
@@ -36,7 +36,6 @@ class TestLMHeadFP32(unittest.TestCase):
|
|||||||
raise unittest.SkipTest("needs CUDA GPU")
|
raise unittest.SkipTest("needs CUDA GPU")
|
||||||
|
|
||||||
def _make_logprocessor(self, vocab_size, enable_fp32):
|
def _make_logprocessor(self, vocab_size, enable_fp32):
|
||||||
ServerArgs.__post_init__ = lambda self: None # disable validation
|
|
||||||
set_global_server_args_for_scheduler(ServerArgs(model_path="dummy"))
|
set_global_server_args_for_scheduler(ServerArgs(model_path="dummy"))
|
||||||
get_global_server_args().enable_dp_lm_head = False
|
get_global_server_args().enable_dp_lm_head = False
|
||||||
get_global_server_args().enable_fp32_lm_head = enable_fp32
|
get_global_server_args().enable_fp32_lm_head = enable_fp32
|
||||||
|
|||||||
@@ -66,10 +66,10 @@ suites = {
|
|||||||
TestFile("rl/test_update_weights_from_disk.py", 114),
|
TestFile("rl/test_update_weights_from_disk.py", 114),
|
||||||
TestFile("rl/test_update_weights_from_tensor.py", 48),
|
TestFile("rl/test_update_weights_from_tensor.py", 48),
|
||||||
TestFile("test_abort.py", 51),
|
TestFile("test_abort.py", 51),
|
||||||
|
TestFile("test_build_eagle_tree.py", 8),
|
||||||
TestFile("test_chunked_prefill.py", 313),
|
TestFile("test_chunked_prefill.py", 313),
|
||||||
TestFile("test_create_kvindices.py", 2),
|
TestFile("test_create_kvindices.py", 2),
|
||||||
TestFile("test_deterministic.py", 300),
|
TestFile("test_deterministic.py", 300),
|
||||||
TestFile("test_build_eagle_tree.py", 8),
|
|
||||||
TestFile("test_eagle_infer_a.py", 370),
|
TestFile("test_eagle_infer_a.py", 370),
|
||||||
TestFile("test_eagle_infer_b.py", 700),
|
TestFile("test_eagle_infer_b.py", 700),
|
||||||
TestFile("test_eagle_infer_beta.py", 300),
|
TestFile("test_eagle_infer_beta.py", 300),
|
||||||
@@ -158,12 +158,17 @@ suites = {
|
|||||||
TestFile("test_multi_instance_release_memory_occupation.py", 64),
|
TestFile("test_multi_instance_release_memory_occupation.py", 64),
|
||||||
TestFile("test_pp_single_node.py", 481),
|
TestFile("test_pp_single_node.py", 481),
|
||||||
],
|
],
|
||||||
"per-commit-8-gpu": [
|
"per-commit-8-gpu-h200": [
|
||||||
TestFile("lora/test_lora_llama4.py", 400),
|
TestFile("lora/test_lora_llama4.py", 400),
|
||||||
TestFile("test_deepseek_v3_basic.py", 275),
|
TestFile("test_deepseek_v3_basic.py", 275),
|
||||||
TestFile("test_deepseek_v3_mtp.py", 275),
|
TestFile("test_deepseek_v3_mtp.py", 275),
|
||||||
TestFile("test_disaggregation_hybrid_attention.py", 200),
|
TestFile("test_disaggregation_hybrid_attention.py", 200),
|
||||||
],
|
],
|
||||||
|
"per-commit-8-gpu-h20": [
|
||||||
|
TestFile("quant/test_w4a8_deepseek_v3.py", 371),
|
||||||
|
TestFile("test_disaggregation_different_tp.py", 600),
|
||||||
|
TestFile("test_disaggregation_pp.py", 140),
|
||||||
|
],
|
||||||
"per-commit-4-gpu-b200": [
|
"per-commit-4-gpu-b200": [
|
||||||
# TestFile("test_gpt_oss_4gpu.py", 600),
|
# TestFile("test_gpt_oss_4gpu.py", 600),
|
||||||
# TestFile("test_deepseek_v3_fp4_4gpu.py", 3600),
|
# TestFile("test_deepseek_v3_fp4_4gpu.py", 3600),
|
||||||
@@ -172,17 +177,12 @@ suites = {
|
|||||||
TestFile("ep/test_deepep_small.py", 531),
|
TestFile("ep/test_deepep_small.py", 531),
|
||||||
TestFile("ep/test_mooncake_ep_small.py", 450),
|
TestFile("ep/test_mooncake_ep_small.py", 450),
|
||||||
],
|
],
|
||||||
"per-commit-8-gpu-deepep": [
|
"per-commit-8-gpu-h200-deepep": [
|
||||||
TestFile("ep/test_deepep_large.py", 338),
|
TestFile("ep/test_deepep_large.py", 338),
|
||||||
],
|
],
|
||||||
"per-commit-8-gpu-deepseek-v32": [
|
"per-commit-8-gpu-h200-deepseek-v32": [
|
||||||
TestFile("test_deepseek_v32_basic.py", 275),
|
TestFile("test_deepseek_v32_basic.py", 275),
|
||||||
],
|
],
|
||||||
"per-commit-8-gpu-h20": [
|
|
||||||
TestFile("test_disaggregation_different_tp.py", 600),
|
|
||||||
TestFile("test_disaggregation_pp.py", 140),
|
|
||||||
TestFile("quant/test_w4a8_deepseek_v3.py", 371),
|
|
||||||
],
|
|
||||||
"vllm_dependency_test": [
|
"vllm_dependency_test": [
|
||||||
TestFile("quant/test_awq.py", 163),
|
TestFile("quant/test_awq.py", 163),
|
||||||
TestFile("test_bnb.py", 5),
|
TestFile("test_bnb.py", 5),
|
||||||
|
|||||||
Reference in New Issue
Block a user