release initial code

Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
Lianmin Zheng
2024-01-08 04:37:50 +00:00
parent f6d40df0ee
commit 22085081bb
145 changed files with 17802 additions and 2 deletions

@@ -0,0 +1,47 @@
## Run benchmark
### Benchmark sglang
```
python3 -m sglang.launch_server --model-path codellama/CodeLlama-7b-instruct-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 10 --parallel 1
```
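The first command launches the SGLang server; the second runs the benchmark client against it. `--parallel` sets the number of client threads used by `run_batch`.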
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model codellama/CodeLlama-7b-instruct-hf --disable-log-requests --port 21000 --gpu-memory-utilization 0.97
```
```
python3 bench_other.py --backend vllm --num-questions 64
```
### Benchmark guidance
```
python3 bench_other.py --backend guidance --num-questions 32 --parallel 1
```
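Guidance needs no server: `bench_other.py` loads a local GGUF build of CodeLlama-7b through llama.cpp, so adjust the checkpoint path in the script to your environment.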
### Build dataset
```
pip install PyPDF2
python3 build_dataset.py
```
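The text is extracted from the Llama 2 paper PDF with a snippet along these lines (assuming the paper is saved as `llama2.pdf`; `build_dataset.py` then reads the resulting `llama2.txt`):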
```python
import PyPDF2

# Extract the raw text of the paper; build_dataset.py reads it as llama2.txt.
with open('llama2.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    text = ''
    for page in reader.pages:
        text += page.extract_text()

with open('llama2.txt', 'w') as text_file:
    text_file.write(text)
```
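
For reference, `build_dataset.py` writes a single JSON line to `questions.jsonl` with this shape:
```
{"documents": ["<segment 1>", "..."], "questions": ["<question 1>", "..."], "answers": ["<answer 1>", "..."]}
```
The benchmark scripts read the first line and pair `questions[i]` with `answers[i]`.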

@@ -0,0 +1,126 @@
import argparse
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time

from tqdm import tqdm

from sglang.test.test_utils import (
    add_common_other_args_and_parse,
    call_generate_lightllm,
    call_generate_vllm,
    call_generate_srt_raw,
)
from sglang.utils import read_jsonl, dump_state_text

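# Llama-2 chat-template markers used to wrap the raw prompt for the
# instruct-tuned model.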
USER_PREFIX = "[INST] "
USER_SUFFIX = " [/INST]"
ASSISTANT_PREFIX = ""
ASSISTANT_SUFFIX = " </s><s>"

def multi_document_qa(docs, question, generate):
    s = USER_PREFIX
    s += "Please answer a question according to the given documents.\n"
    s += "Question: " + question + "\nDocuments begin.\n"
    s += "".join(docs)
    s += "\nDocuments end."
    s += ("\n\nBased on the above documents, please answer this question:\n"
         + question + "\nAnswer in three words or fewer.")
    s += USER_SUFFIX
    s += ASSISTANT_PREFIX
    answer = generate(s, max_tokens=16, stop=None)
    return answer

def main(args):
    lines = read_jsonl(args.data_path)
    l = lines[0]
    arguments = []
    labels = []

    num_docs = 10
    if args.backend == "guidance":
        num_docs = 7  # due to OOM

    for i in range(len(l["questions"][:args.num_questions])):
        arguments.append({
            "docs": l["documents"][:num_docs],
            "question": l["questions"][i],
        })
        labels.append(l["answers"][i])
    states = [None] * len(arguments)

    # Select backend
    if args.backend == "lightllm":
        url = f"{args.host}:{args.port}/generate"
        generate = partial(call_generate_lightllm, url=url, temperature=0)
    elif args.backend == "vllm":
        url = f"{args.host}:{args.port}/generate"
        generate = partial(call_generate_vllm, url=url, temperature=0)
    elif args.backend == "srt-raw":
        url = f"{args.host}:{args.port}/generate"
        generate = partial(call_generate_srt_raw, url=url, temperature=0)
    elif args.backend == "guidance":
        from guidance import models, gen

        # NOTE: local GGUF checkpoint; adjust this path to your environment.
        model = models.LlamaCpp(
            "/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf",
            n_gpu_layers=-1, n_ctx=11000)

        def generate(prompt, max_tokens, stop):
            out = model + prompt + gen(name="answer",
                max_tokens=max_tokens, temperature=0, stop=stop)
            return out["answer"]

        # warmup
        generate("Hello!", max_tokens=8, stop=None)
    else:
        raise ValueError(f"Invalid backend: {args.backend}")
    # Run requests
    def get_one_answer(i):
        states[i] = multi_document_qa(generate=generate, **arguments[i])

    tic = time.time()
    if args.parallel == 1:
        for i in tqdm(range(len(labels))):
            get_one_answer(i)
    else:
        with ThreadPoolExecutor(args.parallel) as executor:
            executor.map(get_one_answer, list(range(len(labels))))
    latency = time.time() - tic

    # Compute accuracy: an answer counts as correct if every word of the
    # label appears in it (case-insensitive).
    print(states)
    correct = 0
    for s, label in zip(states, labels):
        answer = s.lower()
        if all(x in answer for x in label.lower().split(" ")):
            correct += 1
    accuracy = correct / len(labels)
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Latency: {latency:.3f}")

    # Write results
    dump_state_text(f"tmp_output_{args.backend}.txt", states)
    with open(args.result_file, "a") as fout:
        value = {
            "task": "multi_document_qa",
            "backend": args.backend,
            "num_gpus": 1,
            "latency": round(latency, 3),
            "num_requests": args.num_questions,
            "accuracy": accuracy,
            "other": {
                "num_questions": args.num_questions,
                "parallel": args.parallel,
            },
        }
        fout.write(json.dumps(value) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-path", type=str, default="questions.jsonl")
    parser.add_argument("--num-questions", type=int, default=100)
    args = add_common_other_args_and_parse(parser)
    main(args)

@@ -0,0 +1,84 @@
import argparse
import json
import time

import sglang as sgl
from sglang.test.test_utils import (
    add_common_sglang_args_and_parse,
    select_sglang_backend,
)
from sglang.utils import read_jsonl, dump_state_text

@sgl.function
def multi_document_qa(s, docs, question):
    s += sgl.user_begin()
    s += "Please answer a question according to the given documents.\n"
    s += "Question: " + question + "\nDocuments begin.\n"
    forks = s.fork(len(docs))
    forks += lambda i: docs[i]
    forks.join("concate_and_append")
    s += "\nDocuments end."
    s += ("\n\nBased on the above documents, please answer this question:\n"
         + question + "\nAnswer in three words or fewer.")
    s += sgl.user_end()
    s += sgl.assistant(sgl.gen("answer", max_tokens=16))

def main(args):
    lines = read_jsonl(args.data_path)
    l = lines[0]
    arguments = []
    labels = []
    for i in range(len(l["questions"][:args.num_questions])):
        arguments.append({
            "docs": l["documents"][:10],
            "question": l["questions"][i],
        })
        labels.append(l["answers"][i])

    # Select backend
    backend = select_sglang_backend(args)
    sgl.set_default_backend(backend)

    # Run requests
    tic = time.time()
    states = multi_document_qa.run_batch(
        arguments, temperature=0, num_threads=args.parallel)
    latency = time.time() - tic

    # Compute accuracy: an answer counts as correct if every word of the
    # label appears in it (case-insensitive).
    print([s["answer"] for s in states])
    correct = 0
    for s, label in zip(states, labels):
        answer = s["answer"].lower()
        if all(x in answer for x in label.lower().split(" ")):
            correct += 1
    accuracy = correct / len(labels)
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Latency: {latency:.3f}")

    # Write results
    dump_state_text(f"tmp_output_{args.backend}.txt", states)
    with open(args.result_file, "a") as fout:
        value = {
            "task": "multi_document_qa",
            "backend": args.backend,
            "num_gpus": 1,
            "latency": round(latency, 3),
            "num_requests": args.num_questions,
            "accuracy": accuracy,
            "other": {
                "num_questions": args.num_questions,
                "parallel": args.parallel,
            },
        }
        fout.write(json.dumps(value) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-path", type=str, default="questions.jsonl")
    parser.add_argument("--num-questions", type=int, default=100)
    args = add_common_sglang_args_and_parse(parser)
    main(args)

@@ -0,0 +1,64 @@
import json

import transformers

content = "\n".join(
    open("llama2.txt", 'r', encoding='utf-8', errors='ignore').readlines())
content = content.replace("\n\n", "\n")

# Count tokens
name = "meta-llama/Llama-2-7b-chat-hf"
t = transformers.AutoTokenizer.from_pretrained(name)
print(f"num tokens: {len(t.encode(content))}")

# Segment the text into chunks of roughly segment_len tokens each, splitting
# on blank-line boundaries. Note: a trailing partial chunk shorter than
# segment_len is discarded.
SEP = "\n\n"
parts = content.split(SEP)
print(f"num segments: {len(parts)}")

segment_len = 1100
segments = []
tmp = []
tmp_len = 0
for i in range(len(parts)):
    tmp.append(parts[i])
    tmp_len += len(t.encode(parts[i]))
    if tmp_len > segment_len:
        segments.append(SEP.join(tmp))
        tmp = []
        tmp_len = 0

for i, s in enumerate(segments):
    print(i, len(t.encode(s)))

# Dump
with open("questions.jsonl", "w") as fout:
    fout.write(json.dumps({
        "documents": segments[:30],
        "questions": [
            "What is the name of the fine-tuned LLMs?",
            "Which figure shows the helpfulness human evaluation results for Llama 2-Chat?",
            "What is the number of parameters in the largest Llama 2 model?",
            "What is the batch size of fine-tuning?",
            "Where can we find the details of potential data contamination?",
            "What is the full name of MPT?",
            "What is the power consumption of RSC in Watt?",
            "How many tokens of data do they train on?",
            "Which model's release is delayed due to a lack of time to sufficiently red team?",
            "Which activation function is used in Llama?"
        ],
        "answers": [
            "Llama 2 Chat",
            "1",
            "70 B",
            "64",
            "A 6",
            "MosaicML",
            "400",
            "2 trillion",
            "34 B",
            "SwiGLU",
        ],
    }) + "\n")