HiCache, add bench long context plus minor fixes (#9086)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
benchmark/hicache/bench_long_context.py (Normal file, 96 lines added)
@@ -0,0 +1,96 @@
import json
import queue
import time

import requests
from bench_multiturn import (
    ReadyQueue,
    WorkloadGenerator,
    gen_payload,
    log_to_jsonl_file,
    parse_args,
)
from tqdm.asyncio import tqdm

from sglang.bench_serving import get_tokenizer


class ContextWorkloadGenerator(WorkloadGenerator):
    def __init__(self, args):
        # Construct the base URL for requests
        self.baseurl = f"http://{args.host}:{args.port}/"
        self.url = self.baseurl + "generate"

        self.tokenizer = get_tokenizer(args.model_path)
        self.distribution = args.distribution
        self.request_rate = args.request_rate
        self.start_time = None
        self.finished_time = None

        self.sent_requests = 0
        self.completed_requests = 0

        self.dataset = json.load(open(args.dataset_path))

        init_requests = []
        for i in range(min(args.num_clients, len(self.dataset["queries"]))):
            context_id = self.dataset["queries"][i]["context"]
            init_requests.append(
                (
                    i,
                    gen_payload(
                        self.dataset["contexts"][context_id]
                        + self.dataset["queries"][i]["question"],
                        len(
                            self.tokenizer(
                                self.dataset["queries"][i]["reference_answer"]
                            )["input_ids"]
                        ),
                    ),
                )
            )
        self.ready_queue = ReadyQueue(init_requests=init_requests)

        self.response_queue = queue.Queue()
        self.pbar = tqdm(total=args.num_clients * args.num_rounds)
        self.performance_metrics = {
            "ttft": [],
            "latency": [],
            "itl": [],
            "prompt_len": [],
            "cached_tokens": [],
        }

        self.max_parallel = args.max_parallel
        self.logfile = args.log_file

    def response_handler(self):
        while True:
            try:
                client_id, response = self.response_queue.get(
                    timeout=10
                )  # Block until response is available
                if not response.success:
                    raise ValueError(f"Request failed with error: {response.error}")
                self.performance_metrics["ttft"].append(response.ttft)
                self.performance_metrics["itl"].extend(response.itl)
                self.performance_metrics["latency"].append(response.latency)
                self.completed_requests += 1

            except queue.Empty:
                if self.pbar.n == self.pbar.total:
                    break


if __name__ == "__main__":
    args = parse_args()
    args.num_rounds = 1
    args.max_parallel = 128
    flush_cache_url = f"http://{args.host}:{args.port}/flush_cache"

    for request_rate in [24, 16, 12, 8, 4, 2, 1]:
        args.request_rate = request_rate
        requests.post(flush_cache_url)
        time.sleep(1)
        performance_data = ContextWorkloadGenerator(args).run()
        log_to_jsonl_file(performance_data, args.log_file, args.tag)
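The benchmark loads its workload from the JSON file referenced by `args.dataset_path`. A minimal sketch of the layout it appears to expect, inferred only from the dictionary keys read above (`contexts`, `queries`, `context`, `question`, `reference_answer`), is shown below; the real dataset may carry additional fields, and `contexts` could equally be a dict keyed by context id rather than a list.

```python
# Hypothetical dataset sketch for bench_long_context.py, inferred from the keys the
# script reads; the field contents are placeholders, not data from this commit.
import json

dataset = {
    "contexts": [
        "A long shared document that several queries reuse ...",
    ],
    "queries": [
        {
            "context": 0,  # index (or key) into "contexts"
            "question": "What does the shared document say about X?",
            "reference_answer": "It says ...",  # only its token length is used
        },
    ],
}

with open("long_context_dataset.json", "w") as f:
    json.dump(dataset, f)
```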
@@ -322,6 +322,9 @@ class WorkloadGenerator:
            "prompt_len": [],
            "cached_tokens": [],
        }
+        self.num_rounds = args.num_rounds
+        self.max_parallel = args.max_parallel
+        self.output_length = args.output_length

    async def handle_request(self, item):
        try:
@@ -336,7 +339,7 @@ class WorkloadGenerator:
    def request_sender(self):
        async def request_loop():
            while True:
-                if self.sent_requests - self.completed_requests < args.max_parallel:
+                if self.sent_requests - self.completed_requests < self.max_parallel:
                    new_request = self.ready_queue.pop()
                    if new_request:
                        asyncio.create_task(self.handle_request(new_request))
@@ -382,7 +385,7 @@ class WorkloadGenerator:
                self.performance_metrics["cached_tokens"].append(response.cached_tokens)
                self.completed_requests += 1

-                if self.client_records[client_id]["round"] < args.num_rounds:
+                if self.client_records[client_id]["round"] < self.num_rounds:
                    # append new request to client's history
                    self.client_records[client_id][
                        "history"
@@ -392,7 +395,7 @@ class WorkloadGenerator:
                            client_id,
                            gen_payload(
                                self.client_records[client_id]["history"],
-                                args.output_length,
+                                self.output_length,
                            ),
                        )
                    )
@@ -461,7 +464,7 @@ class WorkloadGenerator:
            f" Throughput: {performance_data['summary']['throughput']:.2f} requests per second"
        )
        print(f" Cache Hit Rate: {performance_data['summary']['cache_hit_rate']:.6f}")
-        log_to_jsonl_file(performance_data, args.log_file, tag=args.tag)
+        return performance_data


if __name__ == "__main__":
@@ -482,4 +485,5 @@ if __name__ == "__main__":
        args.request_rate = rate
        requests.post(flush_cache_url)
        time.sleep(1)
-        WorkloadGenerator(args).run()
+        performance_data = WorkloadGenerator(args).run()
+        log_to_jsonl_file(performance_data, args.log_file, tag=args.tag)
@@ -44,9 +44,9 @@ Look for log entries like this:
[2025-08-11 17:17:03] max_total_num_tokens=665690, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=4096, context_len=65536, available_gpu_mem=13.50 GB
```

Check the `available_gpu_mem` value.
- If it is between 5–8 GB, the setting is good.
- If it is too high (e.g., 10 - 20 GB), increase `--mem-fraction-static` to allocate more memory to the KV cache.
- If it is too low, you risk out-of-memory (OOM) errors later, so decrease `--mem-fraction-static`.

Another straightforward approach is to increase `--mem-fraction-static` in increments of 0.01 until you encounter OOM errors for your workloads.
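For orientation, the flag is passed at server launch time; a sketch of a typical invocation (the model path and the 0.85 starting value are placeholders, not values taken from this commit) looks like:

```
python -m sglang.launch_server --model-path <your-model> --mem-fraction-static 0.85
```

Following the guidance above, re-launch with a slightly higher value after each run that leaves too much free memory, and back off once you hit OOM.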
@@ -71,8 +71,10 @@ class HiRadixCache(RadixCache):
        self.tp_group = tp_cache_group
        self.tp_world_size = torch.distributed.get_world_size(group=self.tp_group)
        self.enable_storage = hicache_storage_backend is not None
-        # todo: customizable storage prefetch threshold
+        # todo: customizable storage prefetch threshold and timeout
        self.prefetch_threshold = 256
+        self.prefetch_timeout = 3  # seconds
+        self.prefetch_stop_policy = hicache_storage_prefetch_policy

        self.load_cache_event = threading.Event()
        self.cache_controller = HiCacheController(
@@ -87,13 +89,6 @@ class HiRadixCache(RadixCache):
            prefetch_threshold=self.prefetch_threshold,
        )

-        self.prefetch_stop_policy = hicache_storage_prefetch_policy
-        # todo: customizable storage prefetch timeout
-        self.prefetch_timeout = 3  # seconds
-        logger.info(
-            f"HiCache storage prefetch policy: {hicache_storage_prefetch_policy}"
-        )
-
        # record the nodes with ongoing write through
        self.ongoing_write_through = {}
        # record the node segments with ongoing load back