diff --git a/benchmark/hicache/bench_multiturn.py b/benchmark/hicache/bench_multiturn.py index a3e8b0d74..00b217217 100644 --- a/benchmark/hicache/bench_multiturn.py +++ b/benchmark/hicache/bench_multiturn.py @@ -105,12 +105,16 @@ def parse_args(): action="store_true", help="If set, disable automatically testing with a range of request rates.", ) - parser.add_argument( "--disable-random-sample", action="store_true", help="If set, disable random sampling of requests from the ShareGPT dataset.", ) + parser.add_argument( + "--enable-round-barrier", + action="store_true", + help="If set, only send i-th turn requests after all (i-1)-th turn requests finished.", + ) parser.add_argument( "--sub-question-input-length", type=int, @@ -335,6 +339,19 @@ class WorkloadGenerator: "cached_tokens": [], "generated_len": [], } + self.enable_round_barrier = args.enable_round_barrier + if self.enable_round_barrier: + # Add round-specific metrics while preserving the original structure + for i in range(args.num_rounds): + self.performance_metrics[f"round_{i}"] = { + "ttft": [], + "latency": [], + "prompt_len": [], + "cached_tokens": [], + "generated_len": [], + } + self.num_clients = args.num_clients + self.num_rounds = args.num_rounds self.max_parallel = args.max_parallel self.output_length = args.output_length @@ -383,6 +400,7 @@ class WorkloadGenerator: loop.close() def response_handler(self): + next_round_reqs = [] while True: try: client_id, response = self.response_queue.get( @@ -391,12 +409,29 @@ class WorkloadGenerator: if not response.success: raise ValueError(f"Request failed with error: {response.error}") self.client_records[client_id]["history"] += response.generated_text + current_round = self.client_records[client_id]["round"] self.client_records[client_id]["round"] += 1 self.performance_metrics["ttft"].append(response.ttft) self.performance_metrics["latency"].append(response.latency) self.performance_metrics["prompt_len"].append(response.prompt_len) 
self.performance_metrics["cached_tokens"].append(response.cached_tokens) self.performance_metrics["generated_len"].append(response.generated_len) + if self.enable_round_barrier: + self.performance_metrics[f"round_{current_round}"]["ttft"].append( + response.ttft + ) + self.performance_metrics[f"round_{current_round}"][ + "latency" + ].append(response.latency) + self.performance_metrics[f"round_{current_round}"][ + "prompt_len" + ].append(response.prompt_len) + self.performance_metrics[f"round_{current_round}"][ + "cached_tokens" + ].append(response.cached_tokens) + self.performance_metrics[f"round_{current_round}"][ + "generated_len" + ].append(response.generated_len) self.completed_requests += 1 if self.client_records[client_id]["round"] < self.num_rounds: @@ -404,16 +439,22 @@ class WorkloadGenerator: self.client_records[client_id][ "history" ] += self.sub_question_inputs.pop().prompt - self.ready_queue.append( - ( - client_id, - gen_payload( - self.client_records[client_id]["history"], - self.output_length, - args.lora_path, - ), - ) + new_req = ( + client_id, + gen_payload( + self.client_records[client_id]["history"], + self.output_length, + args.lora_path, + ), ) + if self.enable_round_barrier: + next_round_reqs.append(new_req) + if len(next_round_reqs) == self.num_clients: + for req in next_round_reqs: + self.ready_queue.append(req) + next_round_reqs = [] + else: + self.ready_queue.append(new_req) except queue.Empty: if self.pbar.n == self.pbar.total: break @@ -469,6 +510,25 @@ class WorkloadGenerator: ), }, } + if self.enable_round_barrier: + performance_data["round"] = {} + for round_num in range(args.num_rounds): + round_key = f"round_{round_num}" + round_metrics = self.performance_metrics[round_key] + performance_data["round"][round_key] = { + "average_ttft": ( + sum(round_metrics["ttft"]) / len(round_metrics["ttft"]) + if round_metrics["ttft"] + else 0 + ), + "cache_hit_rate": ( + 0 + if sum(round_metrics["prompt_len"]) == 0 + else 
# Print round-based summary
If excessive allocation failures are observed, consider lowering this parameter accordingly.