Format Benchmark Code (#399)

This commit is contained in:
Liangsheng Yin
2024-04-28 21:06:22 +08:00
committed by GitHub
parent 19818b9c2f
commit 95c4e0dfac
41 changed files with 1169 additions and 608 deletions

View File

@@ -2,6 +2,7 @@
Adapted from
https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa271825c1d3bf391e1277/intro.ipynb#L9
"""
import argparse
import dspy
@@ -29,7 +30,7 @@ class RAG(dspy.Module):
self.retrieve = dspy.Retrieve(k=num_passages)
self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
def forward(self, question):
context = self.retrieve(question).passages
prediction = self.generate_answer(context=context, question=question)
@@ -37,29 +38,41 @@ class RAG(dspy.Module):
def main(args):
#lm = dspy.OpenAI(model='gpt-3.5-turbo')
# lm = dspy.OpenAI(model='gpt-3.5-turbo')
if args.backend == "tgi":
lm = dspy.HFClientTGI(model="meta-llama/Llama-2-7b-chat-hf", port=args.port,
url="http://localhost")
lm = dspy.HFClientTGI(
model="meta-llama/Llama-2-7b-chat-hf",
port=args.port,
url="http://localhost",
)
elif args.backend == "sglang":
lm = dspy.HFClientSGLang(model="meta-llama/Llama-2-7b-chat-hf", port=args.port,
url="http://localhost")
lm = dspy.HFClientSGLang(
model="meta-llama/Llama-2-7b-chat-hf",
port=args.port,
url="http://localhost",
)
elif args.backend == "vllm":
lm = dspy.HFClientVLLM(model="meta-llama/Llama-2-7b-chat-hf", port=args.port,
url="http://localhost")
lm = dspy.HFClientVLLM(
model="meta-llama/Llama-2-7b-chat-hf",
port=args.port,
url="http://localhost",
)
else:
raise ValueError(f"Invalid backend: {args.backend}")
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
colbertv2_wiki17_abstracts = dspy.ColBERTv2(
url="http://20.102.90.50:2017/wiki17_abstracts"
)
dspy.settings.configure(lm=lm, rm=colbertv2_wiki17_abstracts)
# Load the dataset.
dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=args.dev_size,
test_size=0)
dataset = HotPotQA(
train_seed=1, train_size=20, eval_seed=2023, dev_size=args.dev_size, test_size=0
)
# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
trainset = [x.with_inputs('question') for x in dataset.train]
devset = [x.with_inputs('question') for x in dataset.dev]
trainset = [x.with_inputs("question") for x in dataset.train]
devset = [x.with_inputs("question") for x in dataset.dev]
print(len(trainset), len(devset))
@@ -72,15 +85,19 @@ def main(args):
print(f"Answer: {dev_example.answer}")
print(f"Relevant Wikipedia Titles: {dev_example.gold_titles}")
print(f"For this dataset, training examples have input keys {train_example.inputs().keys()} and label keys {train_example.labels().keys()}")
print(f"For this dataset, dev examples have input keys {dev_example.inputs().keys()} and label keys {dev_example.labels().keys()}")
print(
f"For this dataset, training examples have input keys {train_example.inputs().keys()} and label keys {train_example.labels().keys()}"
)
print(
f"For this dataset, dev examples have input keys {dev_example.inputs().keys()} and label keys {dev_example.labels().keys()}"
)
# Define the predictor.
generate_answer = dspy.Predict(BasicQA)
# Call the predictor on a particular input.
pred = generate_answer(question=dev_example.question)
# Print the input and the prediction.
print(f"Question: {dev_example.question}")
print(f"Predicted Answer: {pred.answer}")
@@ -89,10 +106,10 @@ def main(args):
# Define the predictor. Notice we're just changing the class. The signature BasicQA is unchanged.
generate_answer_with_chain_of_thought = dspy.ChainOfThought(BasicQA)
# Call the predictor on the same input.
pred = generate_answer_with_chain_of_thought(question=dev_example.question)
# Print the input, the chain of thought, and the prediction.
print(f"Question: {dev_example.question}")
print(f"Thought: {pred.rationale.split('.', 1)[1].strip()}")
@@ -101,22 +118,26 @@ def main(args):
retrieve = dspy.Retrieve(k=3)
topK_passages = retrieve(dev_example.question).passages
print(f"Top {retrieve.k} passages for question: {dev_example.question} \n", '-' * 30, '\n')
print(
f"Top {retrieve.k} passages for question: {dev_example.question} \n",
"-" * 30,
"\n",
)
for idx, passage in enumerate(topK_passages):
print(f'{idx+1}]', passage, '\n')
print(f"{idx+1}]", passage, "\n")
retrieve("When was the first FIFA World Cup held?").passages[0]
from dspy.teleprompt import BootstrapFewShot
# Validation logic: check that the predicted answer is correct.
# Also check that the retrieved context does actually contain that answer.
def validate_context_and_answer(example, pred, trace=None):
    """Metric used by the teleprompter: both the answer and the context must check out.

    Runs DSPy's exact-match check on the predicted answer and its
    passage-match check on the retrieved context. ``trace`` is accepted
    but not used in this function.
    """
    exact_match = dspy.evaluate.answer_exact_match(example, pred)
    passage_match = dspy.evaluate.answer_passage_match(example, pred)
    return exact_match and passage_match
# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)
@@ -125,10 +146,10 @@ def main(args):
# Ask any question you like to this simple RAG program.
my_question = "What castle did David Gregory inherit?"
# Get the prediction. This contains `pred.context` and `pred.answer`.
pred = compiled_rag(my_question)
# Print the contexts and the answer.
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
@@ -137,20 +158,26 @@ def main(args):
from dspy.evaluate.evaluate import Evaluate
# Set up the `evaluate_on_hotpotqa` function. We'll use this many times below.
evaluate_on_hotpotqa = Evaluate(devset=devset, num_threads=args.num_threads, display_progress=True, display_table=5)
evaluate_on_hotpotqa = Evaluate(
devset=devset,
num_threads=args.num_threads,
display_progress=True,
display_table=5,
)
# Evaluate the `compiled_rag` program with the `answer_exact_match` metric.
metric = dspy.evaluate.answer_exact_match
evaluate_on_hotpotqa(compiled_rag, metric=metric)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--port", type=int)
parser.add_argument("--num-threads", type=int, default=32)
parser.add_argument("--dev-size", type=int, default=150)
parser.add_argument("--backend", type=str, choices=["sglang", "tgi", "vllm"],
default="sglang")
parser.add_argument(
"--backend", type=str, choices=["sglang", "tgi", "vllm"], default="sglang"
)
args = parser.parse_args()
if args.port is None:

View File

@@ -122,16 +122,36 @@ Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs
* Must be one of the "Area options," verbatim.
For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
---"""
s += (persona_name + " lives in " + living_sector + " that has " +
living_sector_areas + ".\n")
s += (persona_name + " is currently in " + current_sector + " that has " +
current_sector_areas + ".\n")
s += (
persona_name
+ " lives in "
+ living_sector
+ " that has "
+ living_sector_areas
+ ".\n"
)
s += (
persona_name
+ " is currently in "
+ current_sector
+ " that has "
+ current_sector_areas
+ ".\n"
)
s += daily_plan + ".\n"
s += "Area options: " + sector_options + ".\n"
s += """* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.\n"""
s += (persona_name + " is " + current_action + ". For " + next_action +
", " + persona_name + " should go to the following area: {")
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ " should go to the following area: {"
)
s += sgl.gen(name="Location", max_tokens=10, stop="}")
@@ -162,22 +182,43 @@ Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs
* Must be one of the "Area options," verbatim.
For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
---"""
s += (persona_name + " lives in " + living_sector + " that has " +
living_sector_areas + ".\n")
s += (persona_name + " is currently in " + current_sector + " that has " +
current_sector_areas + ".\n")
s += (
persona_name
+ " lives in "
+ living_sector
+ " that has "
+ living_sector_areas
+ ".\n"
)
s += (
persona_name
+ " is currently in "
+ current_sector
+ " that has "
+ current_sector_areas
+ ".\n"
)
s += daily_plan + ".\n"
s += "Area options: " + sector_options + ".\n"
s += """* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.\n"""
s += (persona_name + " is " + current_action + ". For " + next_action +
", " + persona_name + " should go to the following area: {")
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ " should go to the following area: {"
)
return {"prompt": s, "max_tokens": 10, "stop": "}"}
@sgl.function
def action_location_object(s, persona_name, target_sector, target_sector_areas,
current_action, next_action):
def action_location_object(
s, persona_name, target_sector, target_sector_areas, current_action, next_action
):
s += """
Jane Anderson is in kitchen in Jane Anderson's house.
Jane Anderson is going to Jane Anderson's house that has the following areas: {kitchen, bedroom, bathroom}
@@ -191,20 +232,34 @@ Stay in the current area if the activity can be done there. Never go into other
For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
Answer: {cafe}
---"""
s += (persona_name + " is going to " + target_sector +
" that has the following areas: {" + target_sector_areas + "}\n")
s += (
persona_name
+ " is going to "
+ target_sector
+ " that has the following areas: {"
+ target_sector_areas
+ "}\n"
)
s += """* Stay in the current area if the activity can be done there.
* NEVER go into other people's rooms unless necessary."""
s += (persona_name + " is " + current_action + ". For " + next_action +
", " + persona_name + "should go to the following area in " +
target_sector)
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ "should go to the following area in "
+ target_sector
)
s += " (MUST pick one of {" + target_sector_areas + "}):\n"
s += "Answer: {" + sgl.gen(name="Area", max_tokens=5, stop="}")
def action_location_object_prompt(persona_name, target_sector,
target_sector_areas, current_action,
next_action):
def action_location_object_prompt(
persona_name, target_sector, target_sector_areas, current_action, next_action
):
s = ""
s += """
Jane Anderson is in kitchen in Jane Anderson's house.
@@ -219,13 +274,27 @@ Stay in the current area if the activity can be done there. Never go into other
For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
Answer: {cafe}
---"""
s += (persona_name + " is going to " + target_sector +
" that has the following areas: {" + target_sector_areas + "}\n")
s += (
persona_name
+ " is going to "
+ target_sector
+ " that has the following areas: {"
+ target_sector_areas
+ "}\n"
)
s += """* Stay in the current area if the activity can be done there.
* NEVER go into other people's rooms unless necessary."""
s += (persona_name + " is " + current_action + ". For " + next_action +
", " + persona_name + "should go to the following area in " +
target_sector)
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ "should go to the following area in "
+ target_sector
)
s += " (MUST pick one of {" + target_sector_areas + "}):\n"
s += "Answer: {"
return {"prompt": s, "max_tokens": 5, "stop": "}"}

View File

@@ -1,29 +1,29 @@
import argparse
from functools import partial
import json
import time
from functools import partial
from pathlib import Path
from agent_functions import (
action_location_object_prompt,
action_location_sector_prompt,
generate_event_triple_prompt,
generate_pronunciatio_prompt,
poignancy_event_prompt,
)
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_vllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import read_jsonl, dump_state_text
from agent_functions import (
poignancy_event_prompt,
generate_event_triple_prompt,
generate_pronunciatio_prompt,
action_location_sector_prompt,
action_location_object_prompt,
)
from sglang.utils import dump_state_text, read_jsonl
def main(args):
lines = read_jsonl(args.data_path)[:args.num_events]
lines = read_jsonl(args.data_path)[: args.num_events]
mapping = {
"poignancy_event": poignancy_event_prompt,
"generate_event_triple": generate_event_triple_prompt,
@@ -46,7 +46,7 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp(
str(Path.home()) + "/model_weights/Llama-2-7b-chat.gguf",
@@ -55,11 +55,15 @@ def main(args):
)
def call_generate(prompt, temperature, max_tokens, stop):
    """Generate once with the guidance model and return the captured text.

    ``model`` and ``gen`` come from the enclosing scope. The prompt is
    appended to the model, followed by a single ``gen`` call whose output
    is captured under the name "result".
    """
    out = (
        model
        + prompt
        + gen(
            name="result",
            max_tokens=max_tokens,
            temperature=temperature,
            stop=stop,
        )
    )
    return out["result"]
@@ -87,7 +91,7 @@ def main(args):
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
# to pack weighted functions as a single agent
# to pack weighted functions as a single agent
"num_requests": len(arguments) / len(mapping),
"other": {
"parallel": args.parallel,

View File

@@ -2,24 +2,24 @@ import argparse
import json
import time
from agent_functions import (
action_location_object,
action_location_sector,
generate_event_triple,
generate_pronunciatio,
poignancy_event,
)
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import read_jsonl, dump_state_text
from agent_functions import (
poignancy_event,
generate_event_triple,
generate_pronunciatio,
action_location_sector,
action_location_object,
)
from sglang.utils import dump_state_text, read_jsonl
def main(args):
lines = read_jsonl(args.data_path)[:args.num_events]
lines = read_jsonl(args.data_path)[: args.num_events]
mapping = {
"poignancy_event": poignancy_event,
"generate_event_triple": generate_event_triple,

View File

@@ -1,23 +1,28 @@
import argparse
import ast
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_one_example(lines, i, include_answer):
    """Format record ``i`` of ``lines`` as a "Question: ... Answer:" prompt.

    Args:
        lines: Sequence of dicts with "question" and "answer" keys.
        i: Index of the record to format.
        include_answer: When true, append the record's answer after
            "Answer:" (used for few-shot examples).

    Returns:
        The formatted prompt string.
    """
    example = "Question: " + lines[i]["question"] + "\nAnswer:"
    if include_answer:
        example += " " + lines[i]["answer"]
    return example
@@ -32,7 +37,7 @@ def get_few_shot_examples(lines, k):
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
@@ -50,7 +55,7 @@ def main(args):
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
@@ -68,19 +73,31 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def call_generate(prompt, temperature, max_tokens, stop):
    """Generate once with the guidance model and return the captured text.

    ``model`` and ``gen`` come from the enclosing scope. The prompt is
    appended to the model, followed by a single ``gen`` call whose output
    is captured under the name "answer".
    """
    out = (
        model
        + prompt
        + gen(
            name="answer",
            max_tokens=max_tokens,
            temperature=temperature,
            stop=stop,
        )
    )
    return out["answer"]
elif args.backend == "lmql":
import lmql
model = lmql.model(args.model_path,
endpoint=f"{args.host}:{args.port}")
model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
@lmql.query(model=model)
async def program(question):
@@ -103,7 +120,8 @@ def main(args):
prompt=few_shot_examples + questions[i],
temperature=0,
max_tokens=256,
stop="Question")
stop="Question",
)
states[i] = answer
tic = time.time()
@@ -118,12 +136,18 @@ def main(args):
async def batched_call(batch_size):
    """Issue the generation requests in consecutive batches of ``batch_size``.

    Each batch is awaited together with ``asyncio.gather``; the results are
    written into ``states`` at the positions of the corresponding questions.
    ``questions``, ``few_shot_examples``, ``call_generate`` and ``states``
    come from the enclosing scope.
    """
    for i in range(0, len(questions), batch_size):
        tasks = [
            call_generate(
                few_shot_examples + q,
                temperature=0,
                max_tokens=256,
                stop="Question",
            )
            for q in questions[i : i + batch_size]
        ]
        rets = await asyncio.gather(*tasks)
        # gather preserves the order of its awaitables, so offset j maps
        # straight back to question i + j.
        for j, ret in enumerate(rets):
            states[i + j] = ret
tic = time.time()
asyncio.run(batched_call(batch_size=args.parallel))
@@ -154,7 +178,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -5,15 +5,18 @@ import re
import time
import numpy as np
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_one_example(lines, i, include_answer):
    """Format record ``i`` of ``lines`` as a "Question: ... Answer:" prompt.

    Args:
        lines: Sequence of dicts with "question" and "answer" keys.
        i: Index of the record to format.
        include_answer: When true, append the record's answer after
            "Answer:" (used for few-shot examples).

    Returns:
        The formatted prompt string.
    """
    example = "Question: " + lines[i]["question"] + "\nAnswer:"
    if include_answer:
        example += " " + lines[i]["answer"]
    return example
@@ -28,7 +31,7 @@ def get_few_shot_examples(lines, k):
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
@@ -46,7 +49,7 @@ def main(args):
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
@@ -73,7 +76,12 @@ def main(args):
# Run requests
tic = time.time()
states = few_shot_gsm8k.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True)
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
preds = []
@@ -101,7 +109,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -1,17 +1,22 @@
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
import json
from functools import partial
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from sglang.test.test_utils import add_common_other_args_and_parse, call_select_lightllm, call_select_vllm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_select_lightllm,
call_select_vllm,
)
from sglang.utils import read_jsonl
def get_one_example(lines, i, include_answer):
    """Format record ``i`` of ``lines`` as "<activity_label>: <ctx> ".

    Args:
        lines: Sequence of dicts with "activity_label", "ctx", "endings"
            and "label" keys.
        i: Index of the record to format.
        include_answer: When true, append the ending selected by the
            record's "label" (used for few-shot examples).

    Returns:
        The formatted prompt string; it ends with a space when no answer
        is appended.
    """
    record = lines[i]
    example = record["activity_label"] + ": " + record["ctx"] + " "
    if include_answer:
        example += record["endings"][record["label"]]
    return example
@@ -34,7 +39,7 @@ def main(args):
questions = []
choices = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(get_one_example(lines, i, False))
choices.append(lines[i]["endings"])
labels.append(lines[i]["label"])
@@ -51,7 +56,11 @@ def main(args):
elif args.backend == "guidance":
from guidance import models, select
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def call_select(context, choices):
out = model + context + select(choices, name="answer")
@@ -61,8 +70,10 @@ def main(args):
elif args.backend == "lmql":
import lmql
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
endpoint=f"{args.host}:{args.port}")
model = lmql.model(
"meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}"
)
@lmql.query(model=model)
async def program(ctx, choices):
@@ -83,8 +94,8 @@ def main(args):
# Use thread pool
def get_one_answer(i):
    """Run ``call_select`` for question ``i`` and store the result in ``preds[i]``.

    ``preds``, ``call_select``, ``few_shot_examples``, ``questions`` and
    ``choices`` come from the enclosing scope.
    """
    preds[i] = call_select(
        context=few_shot_examples + questions[i], choices=choices[i]
    )
tic = time.time()
if args.parallel == 1:
@@ -98,13 +109,13 @@ def main(args):
async def batched_call(batch_size):
    """Issue the select requests in consecutive batches of ``batch_size``.

    Each batch is awaited together with ``asyncio.gather``; the results are
    written into ``preds`` at the positions of the corresponding questions.
    ``questions``, ``choices``, ``few_shot_examples``, ``call_select`` and
    ``preds`` come from the enclosing scope.
    """
    for i in range(0, len(questions), batch_size):
        tasks = [
            call_select(context=few_shot_examples + q, choices=c)
            for q, c in zip(
                questions[i : i + batch_size], choices[i : i + batch_size]
            )
        ]
        rets = await asyncio.gather(*tasks)
        # gather preserves the order of its awaitables, so offset j maps
        # straight back to question i + j.
        for j, ret in enumerate(rets):
            preds[i + j] = ret
tic = time.time()
asyncio.run(batched_call(batch_size=args.parallel))
@@ -128,7 +139,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -3,12 +3,16 @@ import json
import time
import numpy as np
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import read_jsonl
def get_one_example(lines, i, include_answer):
    """Format record ``i`` of ``lines`` as "<activity_label>: <ctx> ".

    Args:
        lines: Sequence of dicts with "activity_label", "ctx", "endings"
            and "label" keys.
        i: Index of the record to format.
        include_answer: When true, append the ending selected by the
            record's "label" (used for few-shot examples).

    Returns:
        The formatted prompt string; it ends with a space when no answer
        is appended.
    """
    record = lines[i]
    example = record["activity_label"] + ": " + record["ctx"] + " "
    if include_answer:
        example += record["endings"][record["label"]]
    return example
@@ -31,21 +35,18 @@ def main(args):
questions = []
choices = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(get_one_example(lines, i, False))
choices.append(lines[i]["endings"])
labels.append(lines[i]["label"])
arguments = [
{"question": q, "choices": c}
for q, c in zip(questions, choices)
]
arguments = [{"question": q, "choices": c} for q, c in zip(questions, choices)]
#####################################
######### SGL Program Begin #########
#####################################
import sglang as sgl
@sgl.function
def few_shot_hellaswag(s, question, choices):
s += few_shot_examples + question
@@ -61,7 +62,12 @@ def main(args):
# Run requests
tic = time.time()
rets = few_shot_hellaswag.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True)
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
latency = time.time() - tic
@@ -82,7 +88,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -4,13 +4,14 @@ import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm
from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STRING
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_outlines,
)
from sglang.utils import dump_state_text, read_jsonl
from sglang.lang.ir import REGEX_INT, REGEX_STRING, REGEX_FLOAT
from tqdm import tqdm
REGEX_LIST = r"\[(" + REGEX_STRING + ", )*" + REGEX_STRING + r"\]"

View File

@@ -3,7 +3,7 @@ import json
import time
import sglang as sgl
from sglang.lang.ir import REGEX_INT, REGEX_STRING, REGEX_FLOAT
from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STRING
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
@@ -63,7 +63,9 @@ def main(args):
# Run requests
tic = time.time()
states = json_decode.run_batch(arguments, temperature=0, num_threads=args.parallel, progress_bar=True)
states = json_decode.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
# Compute accuracy

View File

@@ -5,12 +5,13 @@ from concurrent.futures import ThreadPoolExecutor
from functools import partial
import guidance
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_outlines,
)
from sglang.utils import dump_state_text, read_jsonl
from tqdm import tqdm
# There are some FSM bugs with the JSON regex converted from a pydantic model,
# so use a string regex here instead.

View File

@@ -15,16 +15,17 @@ On the client side, run:
--tokenizer <your_model> --dataset <target_dataset> \
--request-rate <request_rate>
"""
import argparse
import asyncio
import json
import random
import time
from typing import AsyncGenerator, List, Tuple
from tqdm.asyncio import tqdm_asyncio
import aiohttp
import numpy as np
from tqdm.asyncio import tqdm_asyncio
from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -41,10 +42,7 @@ def sample_requests(
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [
data for data in dataset
if len(data["conversations"]) >= 2
]
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [
(data["conversations"][0]["value"], data["conversations"][1]["value"])
@@ -185,9 +183,17 @@ async def benchmark(
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len = request
task = asyncio.create_task(send_request(backend, api_url, prompt,
prompt_len, output_len,
best_of, use_beam_search))
task = asyncio.create_task(
send_request(
backend,
api_url,
prompt,
prompt_len,
output_len,
best_of,
use_beam_search,
)
)
tasks.append(task)
await tqdm_asyncio.gather(*tasks)
@@ -202,8 +208,16 @@ def main(args: argparse.Namespace):
input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
benchmark_start_time = time.perf_counter()
asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
args.use_beam_search, args.request_rate))
asyncio.run(
benchmark(
args.backend,
api_url,
input_requests,
args.best_of,
args.use_beam_search,
args.request_rate,
)
)
benchmark_end_time = time.perf_counter()
benchmark_time = benchmark_end_time - benchmark_start_time
print(f"Total time: {benchmark_time:.2f} s")
@@ -212,43 +226,61 @@ def main(args: argparse.Namespace):
# Compute the latency statistics.
avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY])
print(f"Average latency: {avg_latency:.2f} s")
avg_per_token_latency = np.mean([
latency / (prompt_len + output_len)
for prompt_len, output_len, latency in REQUEST_LATENCY
])
avg_per_token_latency = np.mean(
[
latency / (prompt_len + output_len)
for prompt_len, output_len, latency in REQUEST_LATENCY
]
)
print(f"Average latency per token: {avg_per_token_latency:.2f} s")
avg_per_output_token_latency = np.mean([
latency / output_len
for _, output_len, latency in REQUEST_LATENCY
])
print("Average latency per output token: "
f"{avg_per_output_token_latency:.2f} s")
avg_per_output_token_latency = np.mean(
[latency / output_len for _, output_len, latency in REQUEST_LATENCY]
)
print("Average latency per output token: " f"{avg_per_output_token_latency:.2f} s")
if __name__ == "__main__":
    # Command-line entry point: parse the benchmark options and run main().
    parser = argparse.ArgumentParser(
        description="Benchmark the online serving throughput."
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="vllm",
        choices=["vllm", "tgi", "srt", "lightllm"],
    )
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument(
        "--dataset", type=str, required=True, help="Path to the dataset."
    )
    parser.add_argument(
        "--tokenizer", type=str, required=True, help="Name or path of the tokenizer."
    )
    parser.add_argument(
        "--best-of",
        type=int,
        default=1,
        help="Generates `best_of` sequences per prompt and returns the best one.",
    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
    )
    parser.add_argument(
        "--request-rate",
        type=float,
        default=float("inf"),
        help="Number of requests per second. If this is inf, "
        "then all the requests are sent at time 0. "
        "Otherwise, we use Poisson process to synthesize "
        "the request arrival times.",
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="trust remote code from huggingface",
    )
    args = parser.parse_args()
    main(args)

View File

@@ -1,11 +1,15 @@
import argparse
import json
import time
import re
import time
import numpy as np
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text
@@ -35,23 +39,30 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
dst_percent = dst_percents[j]
query_indices = line_obj["group_by_num_hoops"][str(num_hoops)]
query_indices = [q for q in query_indices if
all(l <= src_index for l in line_obj["links"][q]) and q < src_index]
dst_index = query_indices[min(int(len(query_indices) * dst_percent), len(query_indices)-1)]
query_indices = [
q
for q in query_indices
if all(l <= src_index for l in line_obj["links"][q]) and q < src_index
]
dst_index = query_indices[
min(int(len(query_indices) * dst_percent), len(query_indices) - 1)
]
label = line_obj["values"][dst_index]
body = line_obj["lines"][:src_index+1]
body = line_obj["lines"][: src_index + 1]
suffix = line_obj["suffix"].replace("???", line_obj["indices"][dst_index])
body_part_len = len(body) // 4
arguments.append({
"prefix": line_obj["prefix"],
"body_0": "\n".join(body[:body_part_len]),
"body_1": "\n".join(body[body_part_len: 2 * body_part_len]),
"body_2": "\n".join(body[2 * body_part_len: 3 * body_part_len]),
"body_3": "\n".join(body[3 * body_part_len:]),
"suffix": suffix,
})
arguments.append(
{
"prefix": line_obj["prefix"],
"body_0": "\n".join(body[:body_part_len]),
"body_1": "\n".join(body[body_part_len : 2 * body_part_len]),
"body_2": "\n".join(body[2 * body_part_len : 3 * body_part_len]),
"body_3": "\n".join(body[3 * body_part_len :]),
"suffix": suffix,
}
)
labels.append(label)
sum_src_indices.append(src_index)
sum_dst_indices.append(dst_index)
@@ -61,7 +72,12 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
tic = time.time()
states = line_retrieval.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True)
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
corrects = []
@@ -79,7 +95,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
if response_number == label:
break
correct = (response_number == label)
correct = response_number == label
corrects.append(correct)
# Log results
@@ -107,7 +123,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
"other": {
"num_questions": len(arguments),
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -4,12 +4,13 @@ Generate line data for line retrieval task.
Usage:
python3 gen_data.py --number 1000
"""
import argparse
from collections import defaultdict
import json
from tqdm import tqdm
import argparse
import json
from collections import defaultdict
import numpy as np
from tqdm import tqdm
def generate_lines(random_words, num_lines, redirect_ratio):
@@ -42,11 +43,14 @@ def generate_lines(random_words, num_lines, redirect_ratio):
# Add redirect
if redirect_ratio > 0:
num_redirect_lines = int(len(lines) * redirect_ratio)
redirect_indices = np.random.choice(np.arange(len(lines)),
size=(num_redirect_lines,), replace=False)
redirect_indices = np.random.choice(
np.arange(len(lines)), size=(num_redirect_lines,), replace=False
)
for i in redirect_indices:
target_idx = np.random.choice(min(i * 2 + 100, num_lines))
lines[i] = f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
lines[i] = (
f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
)
redirects[i] = target_idx
# Build links and find sources

View File

@@ -1,13 +1,16 @@
import argparse
import json
import time
import os
import time
import tqdm
import sglang as sgl
import tqdm
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
from PIL import Image
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
@sgl.function
@@ -17,17 +20,19 @@ def image_qa(s, image_file, question):
def main(args):
lines = read_jsonl(args.question_file)[:args.num_questions]
lines = read_jsonl(args.question_file)[: args.num_questions]
arguments = [
{"image_file":
os.path.abspath(args.image_folder + "/" + l["image"]),
"question": l["text"]} for l in lines
{
"image_file": os.path.abspath(args.image_folder + "/" + l["image"]),
"question": l["text"],
}
for l in lines
]
#arguments = [
# arguments = [
# {"image_file":
# Image.open(os.path.abspath(args.image_folder + "/" + l["image"])),
# "question": l["text"]} for l in lines
#]
# ]
states = [None] * len(lines)
@@ -41,17 +46,12 @@ def main(args):
for i in tqdm.tqdm(range(len(lines))):
image_file = arguments[i]["image_file"]
question = arguments[i]["question"]
ret = image_qa.run(
image_file=image_file,
question=question,
temperature=0)
ret = image_qa.run(image_file=image_file, question=question, temperature=0)
states[i] = ret
else:
states = image_qa.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True)
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
print(f"Latency: {latency:.3f}")

View File

@@ -1,8 +1,8 @@
import os
# Create the 'images' directory if it doesn't exist. exist_ok=True makes the
# call idempotent and removes the gap between a separate os.path.exists()
# check and the creation itself.
os.makedirs("images", exist_ok=True)
# Base URL
base_url = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/"

View File

@@ -1,27 +1,28 @@
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
system_prompt = (
"Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
system_prompt = "Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
dimension_prompts = [
"Content: This refers to the essences of the essay. The substance should be well researched, accurate, relevant to the topic and should show a thorough understanding of the subject. The essay should also reflect a clear goal or purpose.",
"Organization and Structure: An essay needs to be properly structured with a clear introduction, body, and conclusion. The essay should flow naturally, with one paragraph leading seamlessly into the next.",
"Argument and Analysis: The argument made in the essay should be logical, coherent and clearly articulated. Each point made should be backed up by solid evidence and thorough analysis.",
"Clarity and Precision: The essay should be written in a clear and concise manner. The points made should be easily understood by the reader. The language used should also be precise and unambiguous.",
"Grammar and Punctuation: Proper use of grammar and punctuation is vital in an academic essay. Errors in grammar and punctuation not only distract the reader but can also negatively impact the meaning and interpretation of the content.",
"Referencing and Citation: An essay should contain proper citations and references for all sources used. This not only prevents accusations of plagiarism but also gives credit to the authors of the works that have contributed to the essay. The citation should adhere to a specific format as required by the academic institution or specified by the professor.",
"Content: This refers to the essences of the essay. The substance should be well researched, accurate, relevant to the topic and should show a thorough understanding of the subject. The essay should also reflect a clear goal or purpose.",
"Organization and Structure: An essay needs to be properly structured with a clear introduction, body, and conclusion. The essay should flow naturally, with one paragraph leading seamlessly into the next.",
"Argument and Analysis: The argument made in the essay should be logical, coherent and clearly articulated. Each point made should be backed up by solid evidence and thorough analysis.",
"Clarity and Precision: The essay should be written in a clear and concise manner. The points made should be easily understood by the reader. The language used should also be precise and unambiguous.",
"Grammar and Punctuation: Proper use of grammar and punctuation is vital in an academic essay. Errors in grammar and punctuation not only distract the reader but can also negatively impact the meaning and interpretation of the content.",
"Referencing and Citation: An essay should contain proper citations and references for all sources used. This not only prevents accusations of plagiarism but also gives credit to the authors of the works that have contributed to the essay. The citation should adhere to a specific format as required by the academic institution or specified by the professor.",
]
@@ -31,12 +32,16 @@ def multi_dimension_judge(article, generate):
judges = []
for i in range(len(dimension_prompts)):
comp = generate(s +
"USER: Please judge the quality based on the following metric. " +
dimension_prompts[i] + " Please provide a single-paragraph judgement. " +
"Focus on the provided metric and do not say other things. "
'End your judgement paragraph with the word "END"\nJUDGE:',
max_tokens=256, stop="END")
comp = generate(
s
+ "USER: Please judge the quality based on the following metric. "
+ dimension_prompts[i]
+ " Please provide a single-paragraph judgement. "
+ "Focus on the provided metric and do not say other things. "
'End your judgement paragraph with the word "END"\nJUDGE:',
max_tokens=256,
stop="END",
)
judges.append(comp)
s += "I will judge the quality based on the following metrics.\n"
@@ -50,7 +55,7 @@ def multi_dimension_judge(article, generate):
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
lines = read_jsonl(args.data_path)[: args.num_questions]
states = [None] * len(lines)
# Select backend
@@ -64,13 +69,20 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def generate(prompt, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0, stop=stop)
out = (
model
+ prompt
+ gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
)
return out["answer"]
# warmup
@@ -107,7 +119,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -2,23 +2,22 @@ import argparse
import json
import time
import numpy as np
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
system_prompt = (
"Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
system_prompt = "Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
dimension_prompts = [
"Content: This refers to the essences of the essay. The substance should be well researched, accurate, relevant to the topic and should show a thorough understanding of the subject. The essay should also reflect a clear goal or purpose.",
"Organization and Structure: An essay needs to be properly structured with a clear introduction, body, and conclusion. The essay should flow naturally, with one paragraph leading seamlessly into the next.",
"Argument and Analysis: The argument made in the essay should be logical, coherent and clearly articulated. Each point made should be backed up by solid evidence and thorough analysis.",
"Clarity and Precision: The essay should be written in a clear and concise manner. The points made should be easily understood by the reader. The language used should also be precise and unambiguous.",
"Grammar and Punctuation: Proper use of grammar and punctuation is vital in an academic essay. Errors in grammar and punctuation not only distract the reader but can also negatively impact the meaning and interpretation of the content.",
"Referencing and Citation: An essay should contain proper citations and references for all sources used. This not only prevents accusations of plagiarism but also gives credit to the authors of the works that have contributed to the essay. The citation should adhere to a specific format as required by the academic institution or specified by the professor.",
"Content: This refers to the essences of the essay. The substance should be well researched, accurate, relevant to the topic and should show a thorough understanding of the subject. The essay should also reflect a clear goal or purpose.",
"Organization and Structure: An essay needs to be properly structured with a clear introduction, body, and conclusion. The essay should flow naturally, with one paragraph leading seamlessly into the next.",
"Argument and Analysis: The argument made in the essay should be logical, coherent and clearly articulated. Each point made should be backed up by solid evidence and thorough analysis.",
"Clarity and Precision: The essay should be written in a clear and concise manner. The points made should be easily understood by the reader. The language used should also be precise and unambiguous.",
"Grammar and Punctuation: Proper use of grammar and punctuation is vital in an academic essay. Errors in grammar and punctuation not only distract the reader but can also negatively impact the meaning and interpretation of the content.",
"Referencing and Citation: An essay should contain proper citations and references for all sources used. This not only prevents accusations of plagiarism but also gives credit to the authors of the works that have contributed to the essay. The citation should adhere to a specific format as required by the academic institution or specified by the professor.",
]
@@ -29,23 +28,31 @@ def multi_dimension_judge(s, article):
forks = s.fork(len(dimension_prompts))
for i in range(len(dimension_prompts)):
forks[i] += ("USER: Please judge the quality based on the following metric. " +
dimension_prompts[i] + " Please provide a single-paragraph judgement. " +
"Focus on the provided metric and do not say other things. "
'End your judgement paragraph with the word "END"\nJUDGE:')
forks[i] += (
"USER: Please judge the quality based on the following metric. "
+ dimension_prompts[i]
+ " Please provide a single-paragraph judgement. "
+ "Focus on the provided metric and do not say other things. "
'End your judgement paragraph with the word "END"\nJUDGE:'
)
forks[i] += sgl.gen("judgement", max_tokens=256, stop="END")
forks.join()
s += "I will judge the quality based on the following metrics.\n"
for i in range(len(dimension_prompts)):
s += dimension_prompts[i].split(":")[0] + ": " + forks[i]["judgement"].strip() + "\n"
s += (
dimension_prompts[i].split(":")[0]
+ ": "
+ forks[i]["judgement"].strip()
+ "\n"
)
s += "In summary, on a scale of 1 to 10, I would give the article a score of"
s += sgl.gen("score", max_tokens=2)
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
lines = read_jsonl(args.data_path)[: args.num_questions]
arguments = [{"article": l} for l in lines]
# Select backend
@@ -54,7 +61,12 @@ def main(args):
# Run requests
tic = time.time()
states = multi_dimension_judge.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True)
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
print(f"Latency: {latency:.3f}")
@@ -72,7 +84,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -1,21 +1,25 @@
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm
import numpy as np
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
def json_decode(document, generate):
s = "Please extract the information of a city from the following wikipedia page.\n"
s += "Page begin.\n" + document + "Page end.\n"
s += "Here is the name, country, and symbol of the city in JSON format.\n"
s += '{\n'
s += "{\n"
s += ' "name": "'
s += generate(s, max_tokens=8, stop='"') + '",\n'
s += ' "country": "'
@@ -24,17 +28,19 @@ def json_decode(document, generate):
s += generate(s, max_tokens=8, stop='"') + '",\n'
s += ' "top 3 landmarks": "'
s += generate(s, max_tokens=24, stop='"') + '",\n'
s += '}\n'
s += "}\n"
return s
def main(args):
lines = read_jsonl(args.data_path)
arguments = []
for i in range(len(lines[:args.num_questions])):
arguments.append({
"document": lines[i]["document"],
})
for i in range(len(lines[: args.num_questions])):
arguments.append(
{
"document": lines[i]["document"],
}
)
states = [None] * len(arguments)
# Select backend
@@ -48,13 +54,20 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf", n_gpu_layers=-1, n_ctx=11000)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf",
n_gpu_layers=-1,
n_ctx=11000,
)
def generate(prompt, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0, stop=stop)
out = (
model
+ prompt
+ gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
)
return out["answer"]
# warmup
@@ -91,7 +104,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -2,10 +2,12 @@ import argparse
import json
import time
import numpy as np
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
@sgl.function
@@ -13,21 +15,31 @@ def json_decode(s, document):
s += "Please extract the information of a city from the following wikipedia page.\n"
s += "Page begin.\n" + document + "Page end.\n"
s += "Here is the name, country, and symbol of the city in JSON format.\n"
s += '{\n'
s += "{\n"
s += ' "name": "' + sgl.gen("name", max_tokens=8, stop='"') + '",\n'
s += ' "country": "' + sgl.gen("country", max_tokens=8, stop='"') + '",\n'
s += ' "air port code": "' + sgl.gen("air port code", max_tokens=8, stop='"') + '",\n'
s += ' "top 3 landmarks": "' + sgl.gen("landmarks", max_tokens=24, stop='"') + '",\n'
s += '}\n'
s += (
' "air port code": "'
+ sgl.gen("air port code", max_tokens=8, stop='"')
+ '",\n'
)
s += (
' "top 3 landmarks": "'
+ sgl.gen("landmarks", max_tokens=24, stop='"')
+ '",\n'
)
s += "}\n"
def main(args):
lines = read_jsonl(args.data_path)
arguments = []
for i in range(len(lines[:args.num_questions])):
arguments.append({
"document": lines[i]["document"],
})
for i in range(len(lines[: args.num_questions])):
arguments.append(
{
"document": lines[i]["document"],
}
)
# Select backend
backend = select_sglang_backend(args)
@@ -36,10 +48,11 @@ def main(args):
# Run requests
tic = time.time()
states = json_decode.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True)
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
# Compute accuracy
# Compute accuracy
print(f"Latency: {latency:.3f}")
# Write results
@@ -55,7 +68,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -3,7 +3,6 @@ import json
import transformers
import wikipedia
# Tokenizer used further down to count the tokens of each document.
name = "meta-llama/Llama-2-7b-chat-hf"
t = transformers.AutoTokenizer.from_pretrained(name)
# Cities iterated by the loop that follows.
# NOTE(review): "los angles" looks like a misspelling of "los angeles". It is
# left unchanged because correcting it may change which page is retrieved and
# therefore the contents written to questions.jsonl — confirm before fixing.
city_names = ["los angles", "london", "tokyo", "beijing", "singapore"]
@@ -20,7 +19,9 @@ for city_name in city_names:
truncate_tokens = t.encode(truncate_content)
# Count token
print(f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}")
print(
f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}"
)
with open("questions.jsonl", "a") as fout:
fout.write(json.dumps({"document": truncate_content}) + "\n")

View File

@@ -1,17 +1,22 @@
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
import json
from functools import partial
import os
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
choices = ["A", "B", "C", "D"]
@@ -25,18 +30,22 @@ def format_subject(subject):
s += " " + entry
return s
def format_example(df, idx, include_answer=True):
    """Format row ``idx`` of ``df`` as a multiple-choice prompt.

    The prompt opens with column 0 of the row. Every column between the
    first and the last is emitted as one option, labelled in order with the
    module-level ``choices`` letters, followed by an ``Answer:`` line.

    Args:
        df: Table accessed positionally through ``df.iloc``.
        idx: Positional index of the row to format.
        include_answer: When true, the last column and a blank line are
            appended after ``Answer:``; otherwise the prompt stops right
            after ``Answer:``.

    Returns:
        The prompt text.
    """
    # All columns except the first and the last hold options.
    num_options = df.shape[1] - 2
    prompt = df.iloc[idx, 0]
    # Each option is appended exactly once, in column order.
    for j in range(num_options):
        prompt += f"\n{choices[j]}. {df.iloc[idx, j + 1]}"
    prompt += "\nAnswer:"
    if include_answer:
        # num_options + 1 is the index of the last column.
        prompt += f" {df.iloc[idx, num_options + 1]}\n\n"
    return prompt
def gen_prompt(train_df, subject, k=-1):
prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(format_subject(subject))
prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(
format_subject(subject)
)
if k == -1:
k = train_df.shape[0]
for i in range(k):
@@ -63,7 +72,7 @@ def evaluate(args, subject, dev_df, test_df):
prompt = train_prompt + prompt_end
prompts.append(prompt)
label = test_df.iloc[i, test_df.shape[1]-1]
label = test_df.iloc[i, test_df.shape[1] - 1]
labels.append(label)
preds = [None] * len(prompts)
@@ -82,17 +91,24 @@ def evaluate(args, subject, dev_df, test_df):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url, stop=None)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
if model_initialized is None:
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
model_initialized = model
else:
model = model_initialized
def call_generate(prompt, temperature, max_tokens):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0)
out = (
model
+ prompt
+ gen(name="answer", max_tokens=max_tokens, temperature=0)
)
return out["answer"]
# warmup
@@ -100,8 +116,10 @@ def evaluate(args, subject, dev_df, test_df):
elif args.backend == "lmql":
import lmql
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
endpoint=f"{args.host}:{args.port}")
model = lmql.model(
"meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}"
)
@lmql.query(model=model)
async def program(question):
@@ -112,6 +130,7 @@ def evaluate(args, subject, dev_df, test_df):
async def call_generate(prompt, temperature, max_tokens):
return await program(question=prompt, temperature=temperature)
else:
raise ValueError(f"Invalid backend: {args.backend}")
@@ -119,8 +138,7 @@ def evaluate(args, subject, dev_df, test_df):
if args.backend != "lmql":
# Use thread pool
def get_one_answer(i):
pred = call_generate(prompts[i], temperature=0,
max_tokens=max_tokens)
pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens)
preds[i] = pred.strip()[0]
tic = time.time()
@@ -135,12 +153,11 @@ def evaluate(args, subject, dev_df, test_df):
async def batched_call(batch_size):
for i in range(0, len(prompts), batch_size):
tasks = []
for p in prompts[i:i+batch_size]:
tasks.append(call_generate(p,
temperature=0, max_tokens=max_tokens))
for p in prompts[i : i + batch_size]:
tasks.append(call_generate(p, temperature=0, max_tokens=max_tokens))
rets = await asyncio.gather(*tasks)
for j in range(len(rets)):
preds[i+j] = rets[j].strip()[0]
preds[i + j] = rets[j].strip()[0]
tic = time.time()
asyncio.run(batched_call(batch_size=args.parallel))
@@ -151,22 +168,35 @@ def evaluate(args, subject, dev_df, test_df):
acc = np.mean(cors)
cors = np.array(cors)
print("Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject))
print(
"Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject
)
)
return cors, acc, latency
def main(args):
subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f])
subjects = sorted(
[
f.split("_test.csv")[0]
for f in os.listdir(os.path.join(args.data_dir, "test"))
if "_test.csv" in f
]
)
all_cors = []
all_latencies = []
num_requests = 0
for subject in tqdm(subjects[:args.nsub]):
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)
for subject in tqdm(subjects[: args.nsub]):
dev_df = pd.read_csv(
os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
)[: args.ntrain]
test_df = pd.read_csv(
os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
)
cors, acc, latency = evaluate(args, subject, dev_df, test_df)
all_cors.append(cors)
@@ -191,7 +221,7 @@ def main(args):
"other": {
"nsub": args.nsub,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -7,8 +7,11 @@ import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
choices = ["A", "B", "C", "D"]
@@ -22,24 +25,29 @@ def format_subject(subject):
s += " " + entry
return s
def format_example(df, idx, include_answer=True):
    """Format row ``idx`` of ``df`` as a multiple-choice prompt.

    The prompt opens with column 0 of the row. Every column between the
    first and the last is emitted as one option, labelled in order with the
    module-level ``choices`` letters, followed by an ``Answer:`` line.

    Args:
        df: Table accessed positionally through ``df.iloc``.
        idx: Positional index of the row to format.
        include_answer: When true, the last column and a blank line are
            appended after ``Answer:``; otherwise the prompt stops right
            after ``Answer:``.

    Returns:
        The prompt text.
    """
    # All columns except the first and the last hold options.
    num_options = df.shape[1] - 2
    prompt = df.iloc[idx, 0]
    # Each option is appended exactly once, in column order.
    for j in range(num_options):
        prompt += f"\n{choices[j]}. {df.iloc[idx, j + 1]}"
    prompt += "\nAnswer:"
    if include_answer:
        # num_options + 1 is the index of the last column.
        prompt += f" {df.iloc[idx, num_options + 1]}\n\n"
    return prompt
def gen_prompt(train_df, subject, k=-1):
    """Build the few-shot prefix for ``subject`` from the first ``k`` rows.

    Args:
        train_df: Table of worked examples; each selected row is rendered
            with ``format_example`` using its default arguments.
        subject: Subject identifier, rendered through ``format_subject``.
        k: Number of leading rows to include; ``-1`` means every row of
            ``train_df``.

    Returns:
        The header sentence followed by the formatted examples.
    """
    if k == -1:
        k = train_df.shape[0]
    # No space after "about": the visible tail of format_subject prefixes
    # each word with a space, so its result presumably starts with one.
    header = (
        "The following are multiple choice questions (with answers) about"
        f"{format_subject(subject)}.\n\n"
    )
    return header + "".join(format_example(train_df, i) for i in range(k))
def evaluate(args, subject, dev_df, test_df):
prompts = []
labels = []
@@ -54,7 +62,7 @@ def evaluate(args, subject, dev_df, test_df):
prompt_end = format_example(test_df, i, include_answer=False)
prompts.append(prompt_end)
label = test_df.iloc[i, test_df.shape[1]-1]
label = test_df.iloc[i, test_df.shape[1] - 1]
labels.append(label)
arguments = [{"question": p} for p in prompts]
@@ -66,11 +74,14 @@ def evaluate(args, subject, dev_df, test_df):
import sglang as sgl
if args.backend.startswith("gpt-"):
@sgl.function
def few_shot_mmlu(s, examples, question):
s += sgl.user(examples + question)
s += sgl.assistant(sgl.gen("answer"))
else:
@sgl.function
def few_shot_mmlu(s, examples, question):
s += examples + question + sgl.gen("answer")
@@ -84,32 +95,50 @@ def evaluate(args, subject, dev_df, test_df):
tic = time.time()
states = few_shot_mmlu.bind(examples=few_shot_examples).run_batch(
arguments, temperature=0, max_new_tokens=1,
backend=backend, num_threads=args.parallel)
preds = [s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else ""
for s in states]
arguments,
temperature=0,
max_new_tokens=1,
backend=backend,
num_threads=args.parallel,
)
preds = [
s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else "" for s in states
]
latency = time.time() - tic
cors = [pred == label for pred, label in zip(preds, labels)]
acc = np.mean(cors)
cors = np.array(cors)
print("Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject))
print(
"Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject
)
)
return cors, acc, latency
def main(args):
subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f])
subjects = sorted(
[
f.split("_test.csv")[0]
for f in os.listdir(os.path.join(args.data_dir, "test"))
if "_test.csv" in f
]
)
all_cors = []
all_latencies = []
num_requests = 0
for subject in tqdm(subjects[:args.nsub]):
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)
for subject in tqdm(subjects[: args.nsub]):
dev_df = pd.read_csv(
os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
)[: args.ntrain]
test_df = pd.read_csv(
os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
)
cors, acc, latency = evaluate(args, subject, dev_df, test_df)
all_cors.append(cors)
@@ -134,7 +163,7 @@ def main(args):
"other": {
"nsub": args.nsub,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -1,14 +1,19 @@
import argparse
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import os
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from fastchat.model import get_conversation_template
import requests
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt,
call_generate_vllm,
)
def load_questions(filename):
@@ -38,7 +43,7 @@ def write_answers(filename, model_id, questions, answers):
def main(args):
questions = load_questions(args.question_file)
questions = (questions * 10)[:args.num_questions]
questions = (questions * 10)[: args.num_questions]
max_tokens = 256
model_id = "llama-2-chat"
@@ -67,9 +72,8 @@ def main(args):
conv.append_message(conv.roles[0], q)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
output = call_generate(prompt,
temperature=0, max_tokens=max_tokens).strip()
prompt = conv.get_prompt()
output = call_generate(prompt, temperature=0, max_tokens=max_tokens).strip()
cur_answers.append(output)
conv.update_last_message(output)
@@ -102,7 +106,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -5,7 +5,10 @@ import time
import uuid
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
def load_questions(filename):
@@ -44,10 +47,9 @@ def answer_mt_bench(s, question_1, question_2):
def main(args):
# Construct prompts
questions = load_questions(args.question_file)[:args.num_questions]
questions = load_questions(args.question_file)[: args.num_questions]
arguments = [
{"question_1": q["turns"][0], "question_2": q["turns"][1]}
for q in questions
{"question_1": q["turns"][0], "question_2": q["turns"][1]} for q in questions
]
# Select backend
@@ -83,7 +85,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -1,23 +1,28 @@
import argparse
import ast
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
@@ -44,14 +49,20 @@ def multi_chain_gsm8k(question, num_chains, call_generate):
comps = []
for i in range(num_chains):
comps.append(call_generate(s + "Answer: " + prompt_lib[i % num_chains],
max_tokens=256, temperature=0.3, stop="Question"))
comps.append(
call_generate(
s + "Answer: " + prompt_lib[i % num_chains],
max_tokens=256,
temperature=0.3,
stop="Question",
)
)
s += "Answer: To answer this question, here are some possible solutions. "
s += "After considering all of them, I will do a majority vote.\n\n"
for i in range(num_chains):
s += f"Solution {i+1}: " + comps[i].strip() + "\n\n"
s += f"\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
s += "\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
s += call_generate(s, max_tokens=16, temperature=0, stop=None)
return s
@@ -64,7 +75,7 @@ def main(args):
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
@@ -82,16 +93,28 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def call_generate(prompt, temperature, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=temperature, stop=stop)
out = (
model
+ prompt
+ gen(
name="answer",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
return out["answer"]
#def multi_chain_gsm8k(question, num_chains, call_generate):
# def multi_chain_gsm8k(question, num_chains, call_generate):
# s = model + "Question: " + question + "\n"
# comps = []
@@ -108,8 +131,10 @@ def main(args):
elif args.backend == "lmql":
import lmql
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
endpoint=f"{args.host}:{args.port}")
model = lmql.model(
"meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}"
)
@lmql.query(model=model)
async def program(question):
@@ -128,8 +153,7 @@ def main(args):
if args.backend != "lmql":
# Use thread pool
def get_one_answer(i):
answer = multi_chain_gsm8k(questions[i], args.num_chains,
call_generate)
answer = multi_chain_gsm8k(questions[i], args.num_chains, call_generate)
states[i] = answer
tic = time.time()
@@ -144,12 +168,18 @@ def main(args):
async def batched_call(batch_size):
for i in range(0, len(questions), batch_size):
tasks = []
for q in questions[i:i+batch_size]:
tasks.append(call_generate(few_shot_examples + q,
temperature=0, max_tokens=256, stop="Question"))
for q in questions[i : i + batch_size]:
tasks.append(
call_generate(
few_shot_examples + q,
temperature=0,
max_tokens=256,
stop="Question",
)
)
rets = await asyncio.gather(*tasks)
for j in range(len(rets)):
states[i+j] = get_answer_value(rets[j])
states[i + j] = get_answer_value(rets[j])
tic = time.time()
asyncio.run(batched_call(batch_size=args.parallel))
@@ -180,7 +210,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -5,16 +5,19 @@ import re
import time
import numpy as np
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
@@ -37,12 +40,12 @@ def main(args):
lines = read_jsonl(args.data_path)
# Construct prompts
#k = args.num_shot
#few_shot_examples = get_few_shot_examples(lines, k)
# k = args.num_shot
# few_shot_examples = get_few_shot_examples(lines, k)
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
@@ -59,21 +62,24 @@ def main(args):
@sgl.function
def multi_chain_gsm8k(s, question):
s += "Question: " + question + "\n"
#s += "Answer: " + prompt_lib[0] + sgl.gen("answer", max_tokens=256, stop="Question",
# s += "Answer: " + prompt_lib[0] + sgl.gen("answer", max_tokens=256, stop="Question",
# temperature=0)
#return
# return
forks = s.fork(num_chains)
for i in range(num_chains):
forks[i] += ("Answer: " + prompt_lib[i % num_chains] +
sgl.gen(f"chain", max_tokens=256, temperature=0.3, stop="Question"))
forks[i] += (
"Answer: "
+ prompt_lib[i % num_chains]
+ sgl.gen("chain", max_tokens=256, temperature=0.3, stop="Question")
)
forks.join()
s += "Answer: To answer this question, here are some possible solutions. "
s += "After considering all of them, I will do a majority vote.\n\n"
for i in range(num_chains):
s += f"Solution {i+1}: " + forks[i]["chain"].strip() + "\n\n"
s += f"\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
s += "\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
s += sgl.gen("answer", max_tokens=16)
#####################################
@@ -86,7 +92,12 @@ def main(args):
# Run requests
tic = time.time()
states = multi_chain_gsm8k.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True)
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
preds = []
@@ -114,7 +125,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -1,15 +1,18 @@
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm
import numpy as np
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
USER_PREFIX = "[INST] "
USER_SUFFIX = " [/INST]"
@@ -25,7 +28,11 @@ def multi_document_qa(docs, question, generate):
s += "".join(docs)
s += "\nDocuments end."
s += ("\n\nBased on the above documents, please answer this question:\n" + question + "\nAnswer in three words or fewer.")
s += (
"\n\nBased on the above documents, please answer this question:\n"
+ question
+ "\nAnswer in three words or fewer."
)
s += USER_SUFFIX
s += ASSISTANT_PREFIX
answer = generate(s, max_tokens=16, stop=None)
@@ -42,11 +49,13 @@ def main(args):
if args.backend == "guidance":
num_docs = 7 # due to OOM
for i in range(len(l["questions"][:args.num_questions])):
arguments.append({
"docs": l["documents"][:num_docs],
"question": l["questions"][i],
})
for i in range(len(l["questions"][: args.num_questions])):
arguments.append(
{
"docs": l["documents"][:num_docs],
"question": l["questions"][i],
}
)
labels.append(l["answers"][i])
states = [None] * len(arguments)
@@ -61,13 +70,20 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf", n_gpu_layers=-1, n_ctx=11000)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf",
n_gpu_layers=-1,
n_ctx=11000,
)
def generate(prompt, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0, stop=stop)
out = (
model
+ prompt
+ gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
)
return out["answer"]
# warmup
@@ -113,7 +129,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -2,10 +2,12 @@ import argparse
import json
import time
import numpy as np
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
@sgl.function
@@ -19,7 +21,11 @@ def multi_document_qa(s, docs, question):
forks.join("concate_and_append")
s += "\nDocuments end."
s += ("\n\nBased on the above documents, please answer this question:\n" + question + "\nAnswer in three words or fewer.")
s += (
"\n\nBased on the above documents, please answer this question:\n"
+ question
+ "\nAnswer in three words or fewer."
)
s += sgl.user_end()
s += sgl.assistant(sgl.gen("answer", max_tokens=16))
@@ -29,11 +35,13 @@ def main(args):
l = lines[0]
arguments = []
labels = []
for i in range(len(l["questions"][:args.num_questions])):
arguments.append({
"docs": l["documents"][:10],
"question": l["questions"][i],
})
for i in range(len(l["questions"][: args.num_questions])):
arguments.append(
{
"docs": l["documents"][:10],
"question": l["questions"][i],
}
)
labels.append(l["answers"][i])
# Select backend
@@ -43,10 +51,11 @@ def main(args):
# Run requests
tic = time.time()
states = multi_document_qa.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True)
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
# Compute accuracy
# Compute accuracy
print([s["answer"] for s in states])
correct = 0
for s, label in zip(states, labels):
@@ -71,7 +80,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -3,7 +3,8 @@ import json
import transformers
content = "\n".join(
open("llama2.txt", 'r', encoding='utf-8', errors='ignore').readlines())
open("llama2.txt", "r", encoding="utf-8", errors="ignore").readlines()
)
content = content.replace("\n\n", "\n")
# Count token
@@ -35,30 +36,35 @@ for i, s in enumerate(segments):
# Dump
with open("questions.jsonl", "w") as fout:
fout.write(json.dumps({
"documents": segments[:30],
"questions": [
"What is the name of the fine-tuned LLMs?",
"Which figure shows the helpfulness human evaluation results for Llama 2-Chat?",
"What is the number of parameters in the largest Llama 2 model?",
"What is the batch size of fine-tuning?",
"Where can we find the details of potential data contamination?",
"What is the full name of MPT?",
"What is the power consumption of RSC in Watt?",
"How many tokens of data do they train on?",
"Which model's release is delayed due to a lack of time to sufficiently red team?",
"Which activation function is used in Llama?"
],
"answers": [
"Llama 2 Chat",
"1",
"70 B",
"64",
"A 6",
"MosaicML",
"400",
"2 trillion",
"34 B",
"SwiGLU",
],
}) + "\n")
fout.write(
json.dumps(
{
"documents": segments[:30],
"questions": [
"What is the name of the fine-tuned LLMs?",
"Which figure shows the helpfulness human evaluation results for Llama 2-Chat?",
"What is the number of parameters in the largest Llama 2 model?",
"What is the batch size of fine-tuning?",
"Where can we find the details of potential data contamination?",
"What is the full name of MPT?",
"What is the power consumption of RSC in Watt?",
"How many tokens of data do they train on?",
"Which model's release is delayed due to a lack of time to sufficiently red team?",
"Which activation function is used in Llama?",
],
"answers": [
"Llama 2 Chat",
"1",
"70 B",
"64",
"A 6",
"MosaicML",
"400",
"2 trillion",
"34 B",
"SwiGLU",
],
}
)
+ "\n"
)

View File

@@ -4,12 +4,12 @@ from argparse import ArgumentParser
from concurrent.futures import ThreadPoolExecutor
import requests
from sglang.test.test_utils import add_common_other_args_and_parse
from sglang.utils import dump_state_text
from data_gen import gen_arguments
from tqdm import tqdm
from vllm.transformers_utils.tokenizer import get_tokenizer
from data_gen import gen_arguments
from sglang.test.test_utils import add_common_other_args_and_parse
from sglang.utils import dump_state_text
def get_generate(args):
@@ -61,7 +61,7 @@ def multi_turns(generate, qas):
s = ""
for qa in qas:
s += qa["prompt"]
s += generate(s, max_tokens=qa["new_tokens"])
s += generate(s, max_tokens=qa["new_tokens"])
return s

View File

@@ -2,22 +2,22 @@ import json
import time
from argparse import ArgumentParser
from data_gen import gen_arguments
from vllm.transformers_utils.tokenizer import get_tokenizer
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text
from vllm.transformers_utils.tokenizer import get_tokenizer
from data_gen import gen_arguments
@sgl.function
def multi_turns(s, qas):
for qa in qas:
s += qa["prompt"]
s += sgl.gen(max_tokens=qa["new_tokens"], ignore_eos=True)
s += sgl.gen(max_tokens=qa["new_tokens"], ignore_eos=True)
def main(args):
@@ -29,7 +29,11 @@ def main(args):
tic = time.time()
states = multi_turns.run_batch(
multi_qas, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True
multi_qas,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic

View File

@@ -1,18 +1,19 @@
import argparse
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from pathlib import Path
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_vllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import read_jsonl, dump_state_text
from sglang.utils import dump_state_text, read_jsonl
def get_prompt(question):
@@ -83,16 +84,15 @@ Action 2: Search[Leonid Levin]
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
Action 3: Finish[yes]
""" + question)
"""
+ question
)
return prompt
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
arguments = [{
"question": k,
"triplets": v
} for l in lines for k, v in l.items()]
lines = read_jsonl(args.data_path)[: args.num_questions]
arguments = [{"question": k, "triplets": v} for l in lines for k, v in l.items()]
states = []
@@ -107,7 +107,7 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp(
str(Path.home()) + "/model_weights/Llama-2-7b-chat.gguf",
@@ -116,12 +116,16 @@ def main(args):
)
def call_generate(prompt, temperature, max_tokens, stop):
out = (model + prompt + gen(
name="result",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
))
out = (
model
+ prompt
+ gen(
name="result",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
return out["result"]
# warmup
@@ -137,15 +141,23 @@ def main(args):
for i in range(1, len(triplets) + 2):
prompt += "Thought " + str(i) + ":"
states.append(prompt)
answer = call_generate(prompt,
max_tokens=200,
temperature=0,
stop="Observation")
answer = call_generate(
prompt, max_tokens=200, temperature=0, stop="Observation"
)
if i > len(triplets):
break
prompt += (triplets[i - 1]["thought"] + "\nAction " + str(i) +
":" + triplets[i - 1]["action"] + "\nObservation " +
str(i) + ":" + triplets[i - 1]["observation"] + "\n")
prompt += (
triplets[i - 1]["thought"]
+ "\nAction "
+ str(i)
+ ":"
+ triplets[i - 1]["action"]
+ "\nObservation "
+ str(i)
+ ":"
+ triplets[i - 1]["observation"]
+ "\n"
)
states.append(answer)

View File

@@ -7,7 +7,7 @@ from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import read_jsonl, dump_state_text
from sglang.utils import dump_state_text, read_jsonl
@sgl.function
@@ -79,7 +79,9 @@ Action 2: Search[Leonid Levin]
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
Action 3: Finish[yes]
""" + question)
"""
+ question
)
for i in range(1, len(triplets) + 2):
s += "Thought " + str(i) + ":"
# NOTE: This is an implementation for replaying a given trace for benchmark purposes. It is not an actual ReAct agent implementation.
@@ -90,17 +92,23 @@ Action 3: Finish[yes]
# print(ss[0]["thought_action"])
if i > len(triplets):
break
s += (triplets[i - 1]["thought"] + "\nAction " + str(i) + ":" +
triplets[i - 1]["action"] + "\nObservation " + str(i) + ":" +
triplets[i - 1]["observation"] + "\n")
s += (
triplets[i - 1]["thought"]
+ "\nAction "
+ str(i)
+ ":"
+ triplets[i - 1]["action"]
+ "\nObservation "
+ str(i)
+ ":"
+ triplets[i - 1]["observation"]
+ "\n"
)
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
arguments = [{
"question": k,
"triplets": v
} for l in lines for k, v in l.items()]
lines = read_jsonl(args.data_path)[: args.num_questions]
arguments = [{"question": k, "triplets": v} for l in lines for k, v in l.items()]
# Select backend
backend = select_sglang_backend(args)
@@ -108,11 +116,12 @@ def main(args):
states = []
tic = time.time()
states = webthink.run_batch(arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
states = webthink.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
# Compute accuracy

View File

@@ -1,22 +1,25 @@
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm
import numpy as np
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
number = 5
def expand_tip(topic, tip, generate):
s = (
"""Please expand a tip for a topic into a detailed paragraph.
"""Please expand a tip for a topic into a detailed paragraph.
Topic: staying healthy
Tip: Regular Exercise
@@ -30,14 +33,23 @@ Topic: writing a blog post
Tip: structure your content effectively
Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement.
Topic: """ + topic + "\nTip: " + tip + "\nParagraph:")
Topic: """
+ topic
+ "\nTip: "
+ tip
+ "\nParagraph:"
)
return generate(s, max_tokens=128, stop=["\n\n"])
def suggest_tips(topic, generate):
s = "Please act as a helpful assistant. Your job is to provide users with useful tips on a specific topic.\n"
s += "USER: Give some tips for " + topic + ".\n"
s += ("ASSISTANT: Okay. Here are " + str(number) + " concise tips, each under 8 words:\n")
s += (
"ASSISTANT: Okay. Here are "
+ str(number)
+ " concise tips, each under 8 words:\n"
)
tips = []
for i in range(1, 1 + number):
@@ -49,12 +61,12 @@ def suggest_tips(topic, generate):
paragraphs = [expand_tip(topic, tip, generate=generate) for tip in tips]
for i in range(1, 1 + number):
s += f"Tip {i}:" + paragraphs[i-1] + "\n"
s += f"Tip {i}:" + paragraphs[i - 1] + "\n"
return s
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
lines = read_jsonl(args.data_path)[: args.num_questions]
states = [None] * len(lines)
# Select backend
@@ -68,13 +80,20 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def generate(prompt, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0, stop=stop)
out = (
model
+ prompt
+ gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
)
return out["answer"]
# warmup
@@ -111,7 +130,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -2,11 +2,12 @@ import argparse
import json
import time
import numpy as np
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
number = 5
@@ -14,7 +15,7 @@ number = 5
@sgl.function
def expand_tip(s, topic, tip):
s += (
"""Please expand a tip for a topic into a detailed paragraph.
"""Please expand a tip for a topic into a detailed paragraph.
Topic: staying healthy
Tip: Regular Exercise
@@ -28,7 +29,12 @@ Topic: writing a blog post
Tip: structure your content effectively
Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement.
Topic: """ + topic + "\nTip: " + tip + "\nParagraph:")
Topic: """
+ topic
+ "\nTip: "
+ tip
+ "\nParagraph:"
)
s += sgl.gen("paragraph", max_tokens=128, stop=["\n\n"], temperature=0)
@@ -36,7 +42,11 @@ Topic: """ + topic + "\nTip: " + tip + "\nParagraph:")
def suggest_tips(s, topic):
s += "Please act as a helpful assistant. Your job is to provide users with useful tips on a specific topic.\n"
s += "USER: Give some tips for " + topic + ".\n"
s += ("ASSISTANT: Okay. Here are " + str(number) + " concise tips, each under 8 words:\n")
s += (
"ASSISTANT: Okay. Here are "
+ str(number)
+ " concise tips, each under 8 words:\n"
)
paragraphs = []
for i in range(1, 1 + number):
@@ -44,14 +54,12 @@ def suggest_tips(s, topic):
paragraphs.append(expand_tip(topic=topic, tip=s[f"tip_{i}"]))
for i in range(1, 1 + number):
s += f"Tip {i}:" + paragraphs[i-1]["paragraph"] + "\n"
s += f"Tip {i}:" + paragraphs[i - 1]["paragraph"] + "\n"
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
arguments = [
{"topic": l["topic"]} for l in lines
]
lines = read_jsonl(args.data_path)[: args.num_questions]
arguments = [{"topic": l["topic"]} for l in lines]
# Select backend
sgl.set_default_backend(select_sglang_backend(args))
@@ -59,7 +67,8 @@ def main(args):
# Run requests
tic = time.time()
states = suggest_tips.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True)
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
# Compute accuracy
@@ -78,7 +87,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -1,25 +1,29 @@
import argparse
import ast
import asyncio
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import re
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
@@ -47,35 +51,56 @@ temp = 0.001
def propose_plan(s, question, num_branches, call_generate):
s += (USER_PREFIX +
"""Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """ + question + USER_SUFFIX)
s += (
USER_PREFIX
+ """Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
+ question
+ USER_SUFFIX
)
s += ASSISTANT_PREFIX
comps = call_generate(s, max_tokens=256, temperature=temp, stop=None, n=num_branches)
comps = call_generate(
s, max_tokens=256, temperature=temp, stop=None, n=num_branches
)
return [s + comp + ASSISTANT_SUFFIX for comp in comps]
def execute_plan(s, num_branches, call_generate):
s += (USER_PREFIX +
"""The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short.""" + USER_SUFFIX)
s += (
USER_PREFIX
+ """The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
+ USER_SUFFIX
)
s += ASSISTANT_PREFIX
comps = call_generate(s, max_tokens=256, temperature=temp, stop=None, n=num_branches)
comps = call_generate(
s, max_tokens=256, temperature=temp, stop=None, n=num_branches
)
return [s + comp + ASSISTANT_SUFFIX for comp in comps]
def reflect_solution(s, num_branches, call_generate):
s += (USER_PREFIX +
"""Okay. Now, evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness.""" + USER_SUFFIX)
s += (
USER_PREFIX
+ """Okay. Now, evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
+ USER_SUFFIX
)
s += ASSISTANT_PREFIX
comps = call_generate(s, max_tokens=256, temperature=temp, stop=None, n=num_branches)
comps = call_generate(
s, max_tokens=256, temperature=temp, stop=None, n=num_branches
)
return [s + comp + ASSISTANT_SUFFIX for comp in comps]
def get_final_answer(s, num_branches, call_generate):
s += (USER_PREFIX +
"""Based on your reflection, do you change your mind? Now, give me the final answer after careful consideration.""" + USER_SUFFIX)
s += (
USER_PREFIX
+ """Based on your reflection, do you change your mind? Now, give me the final answer after careful consideration."""
+ USER_SUFFIX
)
s += ASSISTANT_PREFIX
comps = call_generate(s, max_tokens=256, temperature=temp, stop=None, n=num_branches)
comps = call_generate(
s, max_tokens=256, temperature=temp, stop=None, n=num_branches
)
return [s + comp + ASSISTANT_SUFFIX for comp in comps]
@@ -107,7 +132,7 @@ def main(args):
num_branches = 2
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
@@ -124,20 +149,40 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def call_generate(prompt, temperature, max_tokens, stop, n):
if n == 1:
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=temperature, stop=stop)
out = (
model
+ prompt
+ gen(
name="answer",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
return out["answer"]
else:
rets = []
for i in range(n):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=temperature, stop=stop)
out = (
model
+ prompt
+ gen(
name="answer",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
rets.append(out["answer"])
return rets
@@ -146,6 +191,7 @@ def main(args):
# Run requests
states = [None] * len(questions)
def get_one_answer(i):
states[i] = tree_search(**arguments[i], call_generate=call_generate)
@@ -188,7 +234,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -1,22 +1,25 @@
import argparse
import ast
from collections import Counter
import json
import re
import time
from collections import Counter
import numpy as np
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
import sglang as sgl
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
@@ -40,7 +43,9 @@ temp = 0.001
def propose_plan(s, question, num_branches):
    """Ask for a high-level plan, then fork the state ``num_branches`` ways.

    The planning request (ending with ``question``) is appended to ``s`` as a
    user turn; ``s`` is forked and an assistant generation captured under
    "plan" is appended to the forks, which are returned.
    """
    instruction = """Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
    s += sgl.user(instruction + question)

    branches = s.fork(num_branches)
    plan_gen = sgl.gen("plan", max_tokens=256, temperature=temp)
    branches += sgl.assistant(plan_gen)
    return branches
@@ -48,7 +53,8 @@ def propose_plan(s, question, num_branches):
def execute_plan(s, num_branches):
    """Ask for the plan to be carried out, then fork ``num_branches`` ways.

    Appends the execution request to ``s`` as a user turn, forks ``s``, and
    appends an assistant generation captured under "answer" to the forks,
    which are returned.
    """
    instruction = """The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
    s += sgl.user(instruction)

    branches = s.fork(num_branches)
    answer_gen = sgl.gen("answer", max_tokens=256, temperature=temp)
    branches += sgl.assistant(answer_gen)
    return branches
@@ -56,7 +62,8 @@ def execute_plan(s, num_branches):
def reflect_solution(s, num_branches):
    """Ask for a self-evaluation score, then fork ``num_branches`` ways.

    Appends the reflection request to ``s`` as a user turn, forks ``s``, and
    appends an assistant generation captured under "score" to the forks,
    which are returned.
    """
    instruction = """Okay. Now, evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
    s += sgl.user(instruction)

    branches = s.fork(num_branches)
    score_gen = sgl.gen("score", max_tokens=256, temperature=temp)
    branches += sgl.assistant(score_gen)
    return branches
@@ -64,13 +71,13 @@ def reflect_solution(s, num_branches):
def get_final_answer(s, num_branches):
    """Ask for a reconsidered final answer, then fork ``num_branches`` ways.

    Appends the final-answer request to ``s`` as a user turn, forks ``s``,
    and appends an assistant generation captured under "final_answer" to the
    forks, which are returned.
    """
    instruction = """Based on your reflection, do you change your mind? Now, give me the final answer after careful consideration."""
    s += sgl.user(instruction)

    branches = s.fork(num_branches)
    final_gen = sgl.gen("final_answer", max_tokens=256, temperature=temp)
    branches += sgl.assistant(final_gen)
    return branches
@sgl.function
def tree_search(s, question, num_branches):
plan_forks = propose_plan(s, question, num_branches)
@@ -93,6 +100,7 @@ def tree_search(s, question, num_branches):
return solutions
def main(args):
lines = read_jsonl(args.data_path)
@@ -100,7 +108,7 @@ def main(args):
num_branches = 2
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
@@ -112,7 +120,12 @@ def main(args):
# Run requests
tic = time.time()
states = tree_search.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True)
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
answers_text = []
for s in states:
@@ -144,7 +157,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -1,25 +1,29 @@
import argparse
import ast
import asyncio
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import re
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
@@ -47,27 +51,43 @@ temp = 0.3
def propose_plan(s, question, num_branches, call_generate):
    """Request a high-level plan and return one extended prompt per completion.

    ``s`` is the prompt so far (assumed to be a string -- confirm against the
    caller). The planning request is appended as a user turn followed by the
    assistant prefix, ``call_generate`` is asked for ``num_branches``
    completions, and each completion plus the assistant suffix is appended to
    the prompt to form one entry of the returned list.
    """
    instruction = """Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
    user_turn = USER_PREFIX + instruction + question + USER_SUFFIX
    s += user_turn
    s += ASSISTANT_PREFIX

    completions = call_generate(
        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
    )
    branches = []
    for completion in completions:
        branches.append(s + completion + ASSISTANT_SUFFIX)
    return branches
def execute_plan(s, num_branches, call_generate):
    """Request execution of the plan and return one extended prompt per completion.

    ``s`` is the prompt so far (assumed to be a string -- confirm against the
    caller). The execution request is appended as a user turn followed by the
    assistant prefix, ``call_generate`` is asked for ``num_branches``
    completions, and each completion plus the assistant suffix is appended to
    the prompt to form one entry of the returned list.
    """
    instruction = """The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
    user_turn = USER_PREFIX + instruction + USER_SUFFIX
    s += user_turn
    s += ASSISTANT_PREFIX

    completions = call_generate(
        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
    )
    branches = []
    for completion in completions:
        branches.append(s + completion + ASSISTANT_SUFFIX)
    return branches
def reflect_solution(s, num_branches, call_generate):
    """Request a self-evaluation score and return one extended prompt per completion.

    ``s`` is the prompt so far (assumed to be a string -- confirm against the
    caller). The reflection request is appended as a user turn followed by the
    assistant prefix, ``call_generate`` is asked for ``num_branches``
    completions, and each completion plus the assistant suffix is appended to
    the prompt to form one entry of the returned list.
    """
    instruction = """Okay. Now you evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
    user_turn = USER_PREFIX + instruction + USER_SUFFIX
    s += user_turn
    s += ASSISTANT_PREFIX

    completions = call_generate(
        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
    )
    branches = []
    for completion in completions:
        branches.append(s + completion + ASSISTANT_SUFFIX)
    return branches
@@ -92,7 +112,7 @@ def main(args):
num_branches = 3
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
@@ -109,25 +129,46 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def call_generate(prompt, temperature, max_tokens, stop, n):
    """Produce ``n`` completions of ``prompt`` using the guidance model.

    Returns the "answer" capture directly when ``n == 1``; for any other
    ``n`` returns a list holding one "answer" per sequential generation.
    """

    def sample_once():
        # Every sample re-evaluates ``model + prompt + gen(...)`` from scratch.
        result = model + prompt + gen(
            name="answer",
            max_tokens=max_tokens,
            temperature=temperature,
            stop=stop,
        )
        return result["answer"]

    if n == 1:
        return sample_once()
    return [sample_once() for _ in range(n)]
# Run requests
states = [None] * len(questions)
def get_one_answer(i):
states[i] = tree_search(**arguments[i], call_generate=call_generate)
@@ -170,7 +211,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -1,22 +1,25 @@
import argparse
import ast
from collections import Counter
import json
import re
import time
from collections import Counter
import numpy as np
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
import sglang as sgl
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
@@ -40,7 +43,9 @@ temp = 0.3
def propose_plan(s, question, num_branches):
    """Ask for a high-level plan, then fork the state ``num_branches`` ways.

    The planning request (ending with ``question``) is appended to ``s`` as a
    user turn; ``s`` is forked and an assistant generation captured under
    "plan" is appended to the forks, which are returned.
    """
    instruction = """Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
    s += sgl.user(instruction + question)

    branches = s.fork(num_branches)
    plan_gen = sgl.gen("plan", max_tokens=256, temperature=temp)
    branches += sgl.assistant(plan_gen)
    return branches
@@ -48,7 +53,8 @@ def propose_plan(s, question, num_branches):
def execute_plan(s, num_branches):
    """Ask for the plan to be carried out, then fork ``num_branches`` ways.

    Appends the execution request to ``s`` as a user turn, forks ``s``, and
    appends an assistant generation captured under "answer" to the forks,
    which are returned.
    """
    instruction = """The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
    s += sgl.user(instruction)

    branches = s.fork(num_branches)
    answer_gen = sgl.gen("answer", max_tokens=256, temperature=temp)
    branches += sgl.assistant(answer_gen)
    return branches
@@ -56,7 +62,8 @@ def execute_plan(s, num_branches):
def reflect_solution(s, num_branches):
    """Ask for a self-evaluation score, then fork ``num_branches`` ways.

    Appends the reflection request to ``s`` as a user turn, forks ``s``, and
    appends an assistant generation captured under "score" to the forks,
    which are returned.
    """
    instruction = """Okay. Now you evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
    s += sgl.user(instruction)

    branches = s.fork(num_branches)
    score_gen = sgl.gen("score", max_tokens=256, temperature=temp)
    branches += sgl.assistant(score_gen)
    return branches
@@ -90,7 +97,7 @@ def main(args):
num_branches = 3
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
@@ -102,7 +109,12 @@ def main(args):
# Run requests
tic = time.time()
states = tree_search.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True)
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
answers_text = []
for s in states:
@@ -134,7 +146,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")

View File

@@ -3,3 +3,6 @@ black python
isort test
black test
isort benchmark
black benchmark