Multi-node Tensor Parallelism (#550)

Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
2024-06-17 20:41:24 -07:00
parent 53a7ebd89a
commit 09593e9bc9
10 changed files with 167 additions and 46 deletions
--- a/benchmark/latency_throughput/README.md
+++ b/benchmark/latency_throughput/README.md
@@ -20,7 +20,7 @@ python3 bench_throughput.py --backend srt --tokenizer meta-llama/Llama-2-7b-chat

 ```
 # run synthetic
-python3 synthetic_benchmark.py --backend srt --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256 --port 30000
+python3 bench_throughput.py --backend srt --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256 --port 30000
 ```


@@ -36,7 +36,7 @@ python3 bench_throughput.py --backend vllm --tokenizer meta-llama/Llama-2-7b-cha

 ```
 # run synthetic
-python3 synthetic_benchmark.py --backend vllm --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256 --port 30000
+python3 bench_throughput.py --backend vllm --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256 --port 30000
 ```


--- a/benchmark/latency_throughput/test_latency.py
+++ b/benchmark/latency_throughput/test_latency.py
@@ -24,7 +24,7 @@ if __name__ == "__main__":
            raise ValueError(f"Invalid backend: {args.backend}")

    url = f"{args.host}:{args.port}"
-    a = random.randint(0, 1 << 20)
+    a = 20
    max_new_tokens = 256
    prompt = f"{a, }"