[router] cache-aware load-balancing router v1 (#2114)

2024-11-23 08:34:48 -08:00
parent ad47749b82
commit cbedd1db1d
17 changed files with 1963 additions and 602 deletions
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -727,9 +727,9 @@ def sample_generated_shared_prefix_requests(
    total_input_tokens = 0
    total_output_tokens = 0

-    for group_idx in range(num_groups):
+    for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
        system_prompt = system_prompts[group_idx]
-        for prompt_idx in range(prompts_per_group):
+        for prompt_idx in tqdm(range(prompts_per_group), desc="Generating questions"):
            question = questions[group_idx * prompts_per_group + prompt_idx]
            full_prompt = f"{system_prompt}\n\n{question}"
            prompt_len = len(tokenizer.encode(full_prompt))
--- a/python/sglang/test/few_shot_gsm8k.py
+++ b/python/sglang/test/few_shot_gsm8k.py
@@ -48,9 +48,13 @@ def run_eval(args):
    # Select backend
    set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))

-    # Read data
-    url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
-    filename = download_and_cache_file(url)
+    if args.data_path is None:
+        # Read data
+        url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+        filename = download_and_cache_file(url)
+    else:
+        filename = args.data_path
+
    lines = list(read_jsonl(filename))

    # Construct prompts