Update benchmark scripts (#8)
@@ -23,7 +23,7 @@ python3 bench_dspy_intro.py --backend sglang
 ```
 docker run --name tgi --rm -ti --gpus all --network host \
   -v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
-  ghcr.io/huggingface/text-generation-inference:1.1.0 \
+  ghcr.io/huggingface/text-generation-inference:1.3.0 \
   --model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
   --max-input-length 2048 --max-total-tokens 4096 \
   --port 24000
@@ -57,6 +57,8 @@ def main(args):
             out = model + context + select(choices, name="answer")
             return choices.index(out["answer"])

+        call_select("Hello,", ["world", "earth"])
+
     elif args.backend == "lmql":
         import lmql
         model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
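The added `call_select` line is a warmup request: it absorbs one-time costs (weight loading, tokenizer init, cache setup) so they don't inflate the measured latency. A minimal sketch of that pattern, with a hypothetical `timed` helper that is not part of the repo:

```python
import time

def timed(fn, *args, warmup=1, iters=5):
    """Average latency of fn(*args), excluding warmup calls."""
    for _ in range(warmup):
        fn(*args)  # absorbs one-time costs: weight load, caches, compilation
    tic = time.time()
    for _ in range(iters):
        fn(*args)
    return (time.time() - tic) / iters

# e.g. timed(call_select, "Hello,", ["world", "earth"])
```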
@@ -135,6 +137,6 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--num-shot", type=int, default=20)
     parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-    parser.add_argument("--num-questions", type=int, default=100)
+    parser.add_argument("--num-questions", type=int, default=200)
     args = add_common_other_args_and_parse(parser)
     main(args)
@@ -91,6 +91,6 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--num-shot", type=int, default=20)
    parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-    parser.add_argument("--num-questions", type=int, default=100)
+    parser.add_argument("--num-questions", type=int, default=200)
     args = add_common_sglang_args_and_parse(parser)
     main(args)
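Both scripts end by delegating to a shared helper (`add_common_other_args_and_parse` above, `add_common_sglang_args_and_parse` here), which attaches backend-wide flags before parsing. A hypothetical sketch of the shape of such a helper, using only flags that appear elsewhere in this diff:

```python
import argparse

def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
    # Hypothetical sketch: flags shared by the sglang benchmark scripts.
    parser.add_argument("--parallel", type=int, default=64)
    parser.add_argument("--host", type=str, default="http://127.0.0.1")
    parser.add_argument("--port", type=int, default=None)
    parser.add_argument("--result-file", type=str, default="results.jsonl")
    return parser.parse_args()
```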
@@ -17,14 +17,13 @@ outlines 0.0.22
 
 ### Benchmark sglang
 
-Run llama-7b
+Run Llama-7B
 
 ```
 python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 ```
 
-Run mixtral-8x7b
-(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
+Run Mixtral-8x7B
 
 ```
 python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
@@ -39,7 +38,7 @@ python3 bench_sglang.py --num-questions 10
 
 ### Benchmark vllm
 
-Run llama-7b
+Run Llama-7B
 
 ```
 python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
@@ -54,8 +53,8 @@ python3 bench_other.py --backend vllm --num-questions 10
 
 ### Benchmark guidance
 
-Run llama-7b and benchmark
+Run Llama-7B and benchmark
 
 ```
 python3 bench_other.py --backend guidance --num-questions 10 --parallel 1
 ```
@@ -105,7 +105,7 @@ def main(args):
 
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "json_regex_decode",
+            "task": "json_decode_regex",
             "backend": args.backend,
             "num_gpus": 1,
             "latency": round(latency, 3),
@@ -64,8 +64,6 @@ def main(args):
     # Run requests
     tic = time.time()
     states = json_decode.run_batch(arguments, temperature=0, num_threads=args.parallel)
-    for state in states:
-        state.sync()
     latency = time.time() - tic
 
     # Compute accuracy
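Dropping the explicit `state.sync()` loop only keeps the timing honest if `run_batch` blocks until every request finishes; if it returned lazy states, `latency` would be measured before any work happened. A generic future-based analogue of the removed pattern (a sketch, not the sglang API itself):

```python
import time
from concurrent.futures import ThreadPoolExecutor

def run_batch(tasks, num_threads=4):
    # Stand-in for a lazy batch API: returns unfinished futures.
    pool = ThreadPoolExecutor(num_threads)
    return [pool.submit(t) for t in tasks]

tic = time.time()
futures = run_batch([lambda: sum(range(100_000)) for _ in range(16)])
for f in futures:
    f.result()  # analogue of state.sync(): block until this item is done
latency = time.time() - tic  # only now covers the real work
print(f"Latency: {latency:.3f}")
```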
@@ -80,7 +78,7 @@ def main(args):
 
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "json_regex_decode",
+            "task": "json_decode_regex",
             "backend": args.backend,
             "num_gpus": 1,
             "latency": round(latency, 3),
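Because results are appended, files written before this commit can mix the old `json_regex_decode` label with the new `json_decode_regex` (and, further down, `multi_turns` with `multi_turn_chat`). A small reader that normalizes old labels, assuming one JSON object per line; the dump call itself is outside these hunks, so that layout is an assumption:

```python
import json

RENAMES = {
    "json_regex_decode": "json_decode_regex",
    "multi_turns": "multi_turn_chat",
}

def load_results(path):
    records = []
    with open(path) as fin:
        for line in fin:
            rec = json.loads(line)
            rec["task"] = RENAMES.get(rec["task"], rec["task"])  # old -> new
            records.append(rec)
    return records
```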
@@ -3,19 +3,6 @@
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 ```
 
-### Performance
-
-- Model: Llama-2-7b-chat-hf
-- `--num-prompts 2000 --request-rate 200`
-- On 4 A10 (24G) GPUs
-
-| Backend     | Throughput      | Latency  |
-| ----------- | --------------- | -------- |
-| srt         | 5.82 requests/s | 343.54 s |
-| vllm==0.2.6 | 3.93 requests/s | 509.08 s |
-| vllm==0.2.7 | 5.02 requests/s | 398.25 s |
-
-
 ### SGLang
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
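The throughput benchmark replays prompts from the downloaded ShareGPT dump. A sketch of pulling first-turn prompts out of it, assuming the usual ShareGPT schema (a list of records whose `conversations` field holds `{"from", "value"}` turns):

```python
import json

with open("ShareGPT_V3_unfiltered_cleaned_split.json") as fin:
    data = json.load(fin)

# Keep the first turn of each non-empty conversation as the prompt.
prompts = [
    conv["conversations"][0]["value"]
    for conv in data
    if conv.get("conversations")
]
print(f"loaded {len(prompts)} prompts")
```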
@@ -28,7 +15,7 @@ python3 bench_throughput.py --backend srt --tokenizer meta-llama/Llama-2-7b-chat
 
 ### vLLM
 ```
-python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16
+python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16 --port 21000
 ```
 
 ```
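With the explicit `--port 21000`, the client side of the benchmark knows where to send requests. A quick smoke test against the server, assuming the demo `api_server`'s `/generate` schema (a `prompt` plus sampling parameters, returning a `text` list); verify against your vLLM version before relying on it:

```python
import requests

resp = requests.post(
    "http://127.0.0.1:21000/generate",
    json={"prompt": "Hello,", "max_tokens": 8, "temperature": 0},
)
resp.raise_for_status()
print(resp.json()["text"])
```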
benchmark/llm_judge/articles.jsonl (new file, 25 lines)
File diff suppressed because one or more lines are too long
@@ -95,6 +95,9 @@ def evaluate(args, subject, dev_df, test_df):
                                 max_tokens=max_tokens, temperature=0)
             return out["answer"]
 
+        # warmup
+        call_generate("Hello,", temperature=1.0, max_tokens=8)
+
     elif args.backend == "lmql":
         import lmql
         model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
@@ -1,12 +1,12 @@
 ### Benchmark sglang
 
-Run llama-7b
+Run Llama-7B
 
 ```
 python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 ```
 
-Run mixtral-8x7b
+Run Mixtral-8x7B
 (When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
 
 ```
@@ -27,13 +27,13 @@ python3 bench_sglang.py --tokenizer meta-llama/Llama-2-7b-chat-hf --long
 
 ### Benchmark vLLM
 
-Run llama-7b
+Run Llama-7B
 
 ```
 python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
 ```
 
-Run mixtral-8x7b
+Run Mixtral-8x7B
 
 ```
 python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model mistralai/Mixtral-8x7B-Instruct-v0.1 --disable-log-requests --port 21000 --tensor-parallel-size 8
@@ -53,14 +53,14 @@ python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend vllm
 
 ### Benchmark guidance
 
-Benchmark llama-7b(short output)
+Benchmark Llama-7B (short output)
 
 ```
 python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1
 ```
 
-Benchmark llama-7b(long output)
+Benchmark Llama-7B (long output)
 
 ```
 python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --long
 ```
@@ -99,7 +99,7 @@ def main(args):
 
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "multi_turns",
+            "task": "multi_turn_chat",
             "backend": args.backend,
             "num_gpus": 1,
             "latency": round(latency, 3),
@@ -21,8 +21,6 @@ def multi_turns(s, qas):
 
 
 def main(args):
-    print(args)
-
     tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
 
     multi_qas = gen_arguments(args, tokenizer)
@@ -33,8 +31,6 @@ def main(args):
     states = multi_turns.run_batch(
         multi_qas, temperature=0, backend=backend, num_threads=args.parallel
     )
-    for state in states:
-        state.sync()
     latency = time.time() - tic
 
     print(f"Latency: {latency:.3f}")
@@ -43,7 +39,7 @@ def main(args):
 
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "multi_turns",
+            "task": "multi_turn_chat",
             "backend": args.backend,
             "num_gpus": 1,
             "latency": round(latency, 3),
@@ -74,4 +70,6 @@ if __name__ == "__main__":
     args.min_len_a = 256
     args.max_len_a = 512
     args.num_qa = 20
+
+    print(args)
     main(args)
@@ -1,5 +1,7 @@
 ## Run benchmark
 
+NOTE: This is an implementation for replaying a given trace for throughput/latency benchmark purposes. It is not an actual ReAct agent implementation.
+
 ### Benchmark sglang
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
@@ -124,6 +124,9 @@ def main(args):
             ))
             return out["result"]
 
+        # warmup
+        call_generate("Hello,", 1.0, 8, ".")
+
     else:
         raise ValueError(f"Invalid backend: {args.backend}")
 
||||
@@ -82,9 +82,10 @@ Action 3: Finish[yes]
|
||||
""" + question)
|
||||
for i in range(1, len(triplets) + 2):
|
||||
s += "Thought " + str(i) + ":"
|
||||
# NOTE: This is an implementation for replaying a given trace for benchmark purposes. It is not an actual ReAct agent implementation.
|
||||
ss = s.fork(1)
|
||||
ss[0] += sgl.gen(name="thought_action", max_tokens=200, stop="Observation")
|
||||
# ss.join()
|
||||
ss.join()
|
||||
# to verify the correctness of output, this should be collected
|
||||
# print(ss[0]["thought_action"])
|
||||
if i > len(triplets):
|
||||
|
||||
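Re-enabling `ss.join()` matters because the loop reads the fork's output (`ss[0]["thought_action"]`) afterwards; without the join, the parent stream can race ahead of the forked generation. A thread-based analogue of the fix (the sglang calls themselves need a running server, so this sketch uses plain threads):

```python
import threading

results = {}

def forked_branch():
    # Stands in for: ss[0] += sgl.gen(name="thought_action", ...)
    results["thought_action"] = "Thought 1: ..."

t = threading.Thread(target=forked_branch)  # ss = s.fork(1)
t.start()
t.join()  # ss.join(): wait for the branch before reading its output
assert "thought_action" in results  # safe to read only after the join
```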
benchmark/react/hotpotqa_100.jsonl (new file, 100 lines)
File diff suppressed because one or more lines are too long
@@ -5,13 +5,15 @@ wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_sch
 
 ## Run benchmark
 
+NOTE: This is an implementation for throughput/latency benchmark purposes. The prompts are not tuned to achieve good accuracy on the GSM-8K tasks.
+
 ### Benchmark sglang
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 ```
 
 ```
 python3 bench_sglang.py --num-questions 32 --parallel 8
 python3 bench_sglang.py --num-questions 32
 python3 bench_sglang.py --num-questions 16 --parallel 1
 ```
@@ -141,6 +141,9 @@ def main(args):
             rets.append(out["answer"])
             return rets
 
+        # warmup
+        call_generate("Hello,", 1.0, 8, ".", 1)
+
     # Run requests
     states = [None] * len(questions)
     def get_one_answer(i):
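`states = [None] * len(questions)` plus a `get_one_answer(i)` worker is the usual fan-out shape: each worker fills its own slot, so the list itself needs no locking. A minimal sketch of that pattern with a thread pool, assuming `get_one_answer(i)` returns the answer for question `i`:

```python
from concurrent.futures import ThreadPoolExecutor

def answer_all(questions, get_one_answer, parallel=8):
    states = [None] * len(questions)

    def worker(i):
        states[i] = get_one_answer(i)  # each thread writes a distinct slot

    with ThreadPoolExecutor(parallel) as pool:
        list(pool.map(worker, range(len(questions))))
    return states
```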
@@ -1,20 +1,23 @@
 ## Flashinfer Mode
 
-[`flashinfer`](https://github.com/flashinfer-ai/flashinfer) is a kernel library for LLM serving; we use it here to support our attention computation.
+[flashinfer](https://github.com/flashinfer-ai/flashinfer) is a kernel library for LLM serving.
+It can be used in SGLang runtime to accelerate attention computation.
 
 ### Install flashinfer
 
 Note: The compilation can take a very long time.
 
 ```bash
 git submodule update --init --recursive
 pip install 3rdparty/flashinfer/python
 ```
 
-### Run Sever With Flashinfer Mode
+### Run a Server With Flashinfer Mode
 
-Add through `--model_mode` argument from the command line.
+Add `--model-mode flashinfer` argument to enable flashinfer when launching a server.
+
+Example:
 
 ```bash
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --model-mode flashinfer
 ```
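Since the flashinfer build is slow and optional, a preflight check before passing `--model-mode flashinfer` lets a launch script fail fast. A sketch, assuming the package installed from `3rdparty/flashinfer/python` is importable as `flashinfer`:

```python
import importlib.util

# Fail fast if the optional kernel library is missing.
if importlib.util.find_spec("flashinfer") is None:
    raise SystemExit("flashinfer is not installed; build it first (see above)")
print("flashinfer available; safe to launch with --model-mode flashinfer")
```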
@@ -351,7 +351,7 @@ class MixtralForCausalLM(nn.Module):
 
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, load_format, revision, fall_back_to_pt=False
+            model_name_or_path, cache_dir, load_format, revision
         ):
             if "rotary_emb.inv_freq" in name:
                 continue
@@ -93,7 +93,8 @@ class ServerArgs:
             type=str,
             default=[],
             nargs="+",
-            help="Model mode: [flashinfer, no-cache, aggressive-new-fill]",
+            choices=["flashinfer", "no-cache"],
+            help="Model mode: [flashinfer, no-cache]",
         )
         parser.add_argument(
             "--schedule-heuristic",
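Adding `choices` makes argparse reject the retired `aggressive-new-fill` value at the command line instead of letting it reach the server. The behavior, reproduced standalone:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model-mode",
    type=str,
    default=[],
    nargs="+",
    choices=["flashinfer", "no-cache"],  # each value is validated individually
    help="Model mode: [flashinfer, no-cache]",
)

print(parser.parse_args(["--model-mode", "flashinfer", "no-cache"]).model_mode)
# ['flashinfer', 'no-cache']; "--model-mode aggressive-new-fill" now errors out
```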
@@ -99,7 +99,7 @@ def call_select_vllm(context, choices, url):
     }
     res = requests.post(url, json=data)
     assert res.status_code == 200
-    scores.append(res.json()["prompt_score"])
+    scores.append(res.json().get("prompt_score", 0))
     return np.argmax(scores)
 
     """
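The `.get(..., 0)` fallback keeps one response without a `prompt_score` field from aborting the whole run; the trade-off is that a missing score silently ties at 0 rather than raising. For example:

```python
import numpy as np

responses = [{"prompt_score": 0.3}, {}, {"prompt_score": 0.9}]
scores = [r.get("prompt_score", 0) for r in responses]  # missing -> 0, no KeyError
print(int(np.argmax(scores)))  # 2
```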
@@ -112,7 +112,7 @@ def call_select_vllm(context, choices, url):
 
 
 def add_common_other_args_and_parse(parser):
-    parser.add_argument("--parallel", type=int, default=96)
+    parser.add_argument("--parallel", type=int, default=64)
     parser.add_argument("--host", type=str, default="http://127.0.0.1")
     parser.add_argument("--port", type=int, default=None)
     parser.add_argument(
scripts/launch_tgi.sh (new file, 6 lines)
@@ -0,0 +1,6 @@
+docker run --name tgi --rm -ti --gpus all --network host \
+  -v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
+  ghcr.io/huggingface/text-generation-inference:1.3.0 \
+  --model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
+  --max-input-length 2048 --max-total-tokens 4096 \
+  --port 24000
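Once the container is up, a one-off request confirms the server answers on port 24000. A sketch assuming TGI's standard `/generate` endpoint (an `inputs` string plus a `parameters` object):

```python
import requests

resp = requests.post(
    "http://127.0.0.1:24000/generate",
    json={"inputs": "Hello,", "parameters": {"max_new_tokens": 8}},
)
resp.raise_for_status()
print(resp.json()["generated_text"])
```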