misc: add pre-commit config (#637)
This commit is contained in:
9
.pre-commit-config.yaml
Normal file
9
.pre-commit-config.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
repos:
|
||||
- repo: https://github.com/PyCQA/isort
|
||||
rev: 5.13.2
|
||||
hooks:
|
||||
- id: isort
|
||||
- repo: https://github.com/psf/black
|
||||
rev: stable
|
||||
hooks:
|
||||
- id: black
|
||||
@@ -312,8 +312,8 @@ def main(args: argparse.Namespace):
|
||||
np.sum([output_len for _, output_len, _ in REQUEST_LATENCY]) / benchmark_time
|
||||
)
|
||||
|
||||
#latencies = [round(latency, 2) for _, _, latency in REQUEST_LATENCY]
|
||||
#print(latencies)
|
||||
# latencies = [round(latency, 2) for _, _, latency in REQUEST_LATENCY]
|
||||
# print(latencies)
|
||||
|
||||
print(f"Total time: {benchmark_time:.2f} s")
|
||||
print(f"Request throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
|
||||
|
||||
@@ -48,9 +48,9 @@ def generate_lines(random_words, num_lines, redirect_ratio):
|
||||
)
|
||||
for i in redirect_indices:
|
||||
target_idx = np.random.choice(min(i * 2 + 100, num_lines))
|
||||
lines[
|
||||
i
|
||||
] = f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
|
||||
lines[i] = (
|
||||
f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
|
||||
)
|
||||
redirects[i] = target_idx
|
||||
|
||||
# Build links and find sources
|
||||
|
||||
@@ -3,6 +3,7 @@ Usage:
|
||||
export ANTHROPIC_API_KEY=sk-******
|
||||
python3 anthropic_example_chat.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@@ -30,7 +31,7 @@ def stream():
|
||||
state = multi_turn_question.run(
|
||||
question_1="What is the capital of the United States?",
|
||||
question_2="List two local attractions.",
|
||||
stream=True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for out in state.text_iter():
|
||||
@@ -39,13 +40,18 @@ def stream():
|
||||
|
||||
|
||||
def batch():
|
||||
states = multi_turn_question.run_batch([
|
||||
{"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions."},
|
||||
|
||||
{"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?"},
|
||||
])
|
||||
states = multi_turn_question.run_batch(
|
||||
[
|
||||
{
|
||||
"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions.",
|
||||
},
|
||||
{
|
||||
"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?",
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
for s in states:
|
||||
print(s.messages())
|
||||
|
||||
@@ -9,15 +9,14 @@ import sglang as sgl
|
||||
|
||||
@sgl.function
|
||||
def few_shot_qa(s, question):
|
||||
s += (
|
||||
"""
|
||||
s += """
|
||||
\n\nHuman: What is the capital of France?
|
||||
\n\nAssistant: Paris
|
||||
\n\nHuman: What is the capital of Germany?
|
||||
\n\nAssistant: Berlin
|
||||
\n\nHuman: What is the capital of Italy?
|
||||
\n\nAssistant: Rome
|
||||
""")
|
||||
"""
|
||||
s += "\n\nHuman: " + question + "\n"
|
||||
s += "\n\nAssistant:" + sgl.gen("answer", temperature=0)
|
||||
|
||||
@@ -33,8 +32,8 @@ def single():
|
||||
|
||||
def stream():
|
||||
state = few_shot_qa.run(
|
||||
question="What is the capital of the United States?",
|
||||
stream=True)
|
||||
question="What is the capital of the United States?", stream=True
|
||||
)
|
||||
|
||||
for out in state.text_iter("answer"):
|
||||
print(out, end="", flush=True)
|
||||
@@ -42,10 +41,12 @@ def stream():
|
||||
|
||||
|
||||
def batch():
|
||||
states = few_shot_qa.run_batch([
|
||||
{"question": "What is the capital of the United States?"},
|
||||
{"question": "What is the capital of China?"},
|
||||
])
|
||||
states = few_shot_qa.run_batch(
|
||||
[
|
||||
{"question": "What is the capital of the United States?"},
|
||||
{"question": "What is the capital of China?"},
|
||||
]
|
||||
)
|
||||
|
||||
for s in states:
|
||||
print(s["answer"])
|
||||
|
||||
@@ -3,9 +3,11 @@ Usage:
|
||||
export AZURE_OPENAI_API_KEY=sk-******
|
||||
python3 openai_example_chat.py
|
||||
"""
|
||||
import sglang as sgl
|
||||
|
||||
import os
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@sgl.function
|
||||
def multi_turn_question(s, question_1, question_2):
|
||||
@@ -32,7 +34,7 @@ def stream():
|
||||
state = multi_turn_question.run(
|
||||
question_1="What is the capital of the United States?",
|
||||
question_2="List two local attractions.",
|
||||
stream=True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for out in state.text_iter():
|
||||
@@ -41,13 +43,18 @@ def stream():
|
||||
|
||||
|
||||
def batch():
|
||||
states = multi_turn_question.run_batch([
|
||||
{"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions."},
|
||||
|
||||
{"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?"},
|
||||
])
|
||||
states = multi_turn_question.run_batch(
|
||||
[
|
||||
{
|
||||
"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions.",
|
||||
},
|
||||
{
|
||||
"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?",
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
for s in states:
|
||||
print(s.messages())
|
||||
|
||||
@@ -3,6 +3,7 @@ Usage:
|
||||
export GCP_PROJECT_ID=******
|
||||
python3 gemini_example_chat.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@@ -30,7 +31,7 @@ def stream():
|
||||
state = multi_turn_question.run(
|
||||
question_1="What is the capital of the United States?",
|
||||
question_2="List two local attractions.",
|
||||
stream=True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for out in state.text_iter():
|
||||
@@ -39,13 +40,18 @@ def stream():
|
||||
|
||||
|
||||
def batch():
|
||||
states = multi_turn_question.run_batch([
|
||||
{"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions."},
|
||||
|
||||
{"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?"},
|
||||
])
|
||||
states = multi_turn_question.run_batch(
|
||||
[
|
||||
{
|
||||
"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions.",
|
||||
},
|
||||
{
|
||||
"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?",
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
for s in states:
|
||||
print(s.messages())
|
||||
|
||||
@@ -9,15 +9,14 @@ import sglang as sgl
|
||||
|
||||
@sgl.function
|
||||
def few_shot_qa(s, question):
|
||||
s += (
|
||||
"""The following are questions with answers.
|
||||
s += """The following are questions with answers.
|
||||
Q: What is the capital of France?
|
||||
A: Paris
|
||||
Q: What is the capital of Germany?
|
||||
A: Berlin
|
||||
Q: What is the capital of Italy?
|
||||
A: Rome
|
||||
""")
|
||||
"""
|
||||
s += "Q: " + question + "\n"
|
||||
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
|
||||
|
||||
@@ -33,8 +32,8 @@ def single():
|
||||
|
||||
def stream():
|
||||
state = few_shot_qa.run(
|
||||
question="What is the capital of the United States?",
|
||||
stream=True)
|
||||
question="What is the capital of the United States?", stream=True
|
||||
)
|
||||
|
||||
for out in state.text_iter("answer"):
|
||||
print(out, end="", flush=True)
|
||||
@@ -42,10 +41,12 @@ def stream():
|
||||
|
||||
|
||||
def batch():
|
||||
states = few_shot_qa.run_batch([
|
||||
{"question": "What is the capital of the United States?"},
|
||||
{"question": "What is the capital of China?"},
|
||||
])
|
||||
states = few_shot_qa.run_batch(
|
||||
[
|
||||
{"question": "What is the capital of the United States?"},
|
||||
{"question": "What is the capital of China?"},
|
||||
]
|
||||
)
|
||||
|
||||
for s in states:
|
||||
print(s["answer"])
|
||||
|
||||
@@ -3,6 +3,7 @@ Usage:
|
||||
export GCP_PROJECT_ID=******
|
||||
python3 gemini_example_multimodal_chat.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@@ -19,7 +20,7 @@ if __name__ == "__main__":
|
||||
image_file1="./images/cat.jpeg",
|
||||
image_file2="./images/dog.jpeg",
|
||||
question="Describe difference of the two images in one sentence.",
|
||||
stream=True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for out in state.text_iter("answer"):
|
||||
|
||||
@@ -3,6 +3,7 @@ Usage:
|
||||
export OPENAI_API_KEY=sk-******
|
||||
python3 openai_example_chat.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@@ -31,7 +32,7 @@ def stream():
|
||||
state = multi_turn_question.run(
|
||||
question_1="What is the capital of the United States?",
|
||||
question_2="List two local attractions.",
|
||||
stream=True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for out in state.text_iter():
|
||||
@@ -40,13 +41,18 @@ def stream():
|
||||
|
||||
|
||||
def batch():
|
||||
states = multi_turn_question.run_batch([
|
||||
{"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions."},
|
||||
|
||||
{"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?"},
|
||||
])
|
||||
states = multi_turn_question.run_batch(
|
||||
[
|
||||
{
|
||||
"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions.",
|
||||
},
|
||||
{
|
||||
"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?",
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
for s in states:
|
||||
print(s.messages())
|
||||
|
||||
@@ -9,15 +9,14 @@ import sglang as sgl
|
||||
|
||||
@sgl.function
|
||||
def few_shot_qa(s, question):
|
||||
s += (
|
||||
"""The following are questions with answers.
|
||||
s += """The following are questions with answers.
|
||||
Q: What is the capital of France?
|
||||
A: Paris
|
||||
Q: What is the capital of Germany?
|
||||
A: Berlin
|
||||
Q: What is the capital of Italy?
|
||||
A: Rome
|
||||
""")
|
||||
"""
|
||||
s += "Q: " + question + "\n"
|
||||
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
|
||||
|
||||
@@ -33,8 +32,8 @@ def single():
|
||||
|
||||
def stream():
|
||||
state = few_shot_qa.run(
|
||||
question="What is the capital of the United States?",
|
||||
stream=True)
|
||||
question="What is the capital of the United States?", stream=True
|
||||
)
|
||||
|
||||
for out in state.text_iter("answer"):
|
||||
print(out, end="", flush=True)
|
||||
@@ -42,10 +41,12 @@ def stream():
|
||||
|
||||
|
||||
def batch():
|
||||
states = few_shot_qa.run_batch([
|
||||
{"question": "What is the capital of the United States?"},
|
||||
{"question": "What is the capital of China?"},
|
||||
])
|
||||
states = few_shot_qa.run_batch(
|
||||
[
|
||||
{"question": "What is the capital of the United States?"},
|
||||
{"question": "What is the capital of China?"},
|
||||
]
|
||||
)
|
||||
|
||||
for s in states:
|
||||
print(s["answer"])
|
||||
|
||||
@@ -3,9 +3,11 @@ Usage:
|
||||
export OPENROUTER_API_KEY=sk-******
|
||||
python3 together_example_chat.py
|
||||
"""
|
||||
import sglang as sgl
|
||||
|
||||
import os
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@sgl.function
|
||||
def multi_turn_question(s, question_1, question_2):
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Usage:
|
||||
python3 srt_example_chat.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@@ -29,7 +30,7 @@ def stream():
|
||||
state = multi_turn_question.run(
|
||||
question_1="What is the capital of the United States?",
|
||||
question_2="List two local attractions.",
|
||||
stream=True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for out in state.text_iter():
|
||||
@@ -38,13 +39,18 @@ def stream():
|
||||
|
||||
|
||||
def batch():
|
||||
states = multi_turn_question.run_batch([
|
||||
{"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions."},
|
||||
|
||||
{"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?"},
|
||||
])
|
||||
states = multi_turn_question.run_batch(
|
||||
[
|
||||
{
|
||||
"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions.",
|
||||
},
|
||||
{
|
||||
"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?",
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
for s in states:
|
||||
print(s.messages())
|
||||
|
||||
@@ -2,20 +2,20 @@
|
||||
Usage:
|
||||
python3 srt_example_complete.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@sgl.function
|
||||
def few_shot_qa(s, question):
|
||||
s += (
|
||||
"""The following are questions with answers.
|
||||
s += """The following are questions with answers.
|
||||
Q: What is the capital of France?
|
||||
A: Paris
|
||||
Q: What is the capital of Germany?
|
||||
A: Berlin
|
||||
Q: What is the capital of Italy?
|
||||
A: Rome
|
||||
""")
|
||||
"""
|
||||
s += "Q: " + question + "\n"
|
||||
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
|
||||
|
||||
@@ -31,8 +31,8 @@ def single():
|
||||
|
||||
def stream():
|
||||
state = few_shot_qa.run(
|
||||
question="What is the capital of the United States?",
|
||||
stream=True)
|
||||
question="What is the capital of the United States?", stream=True
|
||||
)
|
||||
|
||||
for out in state.text_iter("answer"):
|
||||
print(out, end="", flush=True)
|
||||
@@ -40,10 +40,12 @@ def stream():
|
||||
|
||||
|
||||
def batch():
|
||||
states = few_shot_qa.run_batch([
|
||||
{"question": "What is the capital of the United States?"},
|
||||
{"question": "What is the capital of China?"},
|
||||
])
|
||||
states = few_shot_qa.run_batch(
|
||||
[
|
||||
{"question": "What is the capital of the United States?"},
|
||||
{"question": "What is the capital of China?"},
|
||||
]
|
||||
)
|
||||
|
||||
for s in states:
|
||||
print(s["answer"])
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""
|
||||
Usage: python3 srt_example_llava.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@@ -12,9 +13,8 @@ def image_qa(s, image_path, question):
|
||||
|
||||
def single():
|
||||
state = image_qa.run(
|
||||
image_path="images/cat.jpeg",
|
||||
question="What is this?",
|
||||
max_new_tokens=128)
|
||||
image_path="images/cat.jpeg", question="What is this?", max_new_tokens=128
|
||||
)
|
||||
print(state["answer"], "\n")
|
||||
|
||||
|
||||
@@ -23,7 +23,8 @@ def stream():
|
||||
image_path="images/cat.jpeg",
|
||||
question="What is this?",
|
||||
max_new_tokens=64,
|
||||
stream=True)
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for out in state.text_iter("answer"):
|
||||
print(out, end="", flush=True)
|
||||
@@ -33,8 +34,8 @@ def stream():
|
||||
def batch():
|
||||
states = image_qa.run_batch(
|
||||
[
|
||||
{"image_path": "images/cat.jpeg", "question":"What is this?"},
|
||||
{"image_path": "images/dog.jpeg", "question":"What is this?"},
|
||||
{"image_path": "images/cat.jpeg", "question": "What is this?"},
|
||||
{"image_path": "images/dog.jpeg", "question": "What is this?"},
|
||||
],
|
||||
max_new_tokens=128,
|
||||
)
|
||||
@@ -43,8 +44,10 @@ def batch():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.6-vicuna-7b",
|
||||
tokenizer_path="llava-hf/llava-1.5-7b-hf")
|
||||
runtime = sgl.Runtime(
|
||||
model_path="liuhaotian/llava-v1.6-vicuna-7b",
|
||||
tokenizer_path="llava-hf/llava-1.5-7b-hf",
|
||||
)
|
||||
sgl.set_default_backend(runtime)
|
||||
print(f"chat template: {runtime.endpoint.chat_template.name}")
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ Usage: python3 srt_example_yi_vl.py
|
||||
|
||||
Requirements: transformers==4.38
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@@ -17,7 +18,8 @@ def single():
|
||||
image_path="images/cat.jpeg",
|
||||
question="What is this?",
|
||||
max_new_tokens=64,
|
||||
stop="###")
|
||||
stop="###",
|
||||
)
|
||||
print(state["answer"], "\n")
|
||||
|
||||
|
||||
@@ -27,7 +29,8 @@ def stream():
|
||||
question="What is this?",
|
||||
max_new_tokens=64,
|
||||
stream=True,
|
||||
stop="###")
|
||||
stop="###",
|
||||
)
|
||||
|
||||
for out in state.text_iter("answer"):
|
||||
print(out, end="", flush=True)
|
||||
@@ -37,11 +40,11 @@ def stream():
|
||||
def batch():
|
||||
states = image_qa.run_batch(
|
||||
[
|
||||
{"image_path": "images/cat.jpeg", "question":"What is this?"},
|
||||
{"image_path": "images/dog.jpeg", "question":"What is this?"},
|
||||
{"image_path": "images/cat.jpeg", "question": "What is this?"},
|
||||
{"image_path": "images/dog.jpeg", "question": "What is this?"},
|
||||
],
|
||||
max_new_tokens=64,
|
||||
stop="###"
|
||||
stop="###",
|
||||
)
|
||||
for s in states:
|
||||
print(s["answer"], "\n")
|
||||
|
||||
@@ -3,9 +3,11 @@ Usage:
|
||||
export TOGETHER_API_KEY=sk-******
|
||||
python3 together_example_chat.py
|
||||
"""
|
||||
import sglang as sgl
|
||||
|
||||
import os
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@sgl.function
|
||||
def multi_turn_question(s, question_1, question_2):
|
||||
@@ -32,7 +34,7 @@ def stream():
|
||||
state = multi_turn_question.run(
|
||||
question_1="What is the capital of the United States?",
|
||||
question_2="List two local attractions.",
|
||||
stream=True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for out in state.text_iter():
|
||||
@@ -41,13 +43,18 @@ def stream():
|
||||
|
||||
|
||||
def batch():
|
||||
states = multi_turn_question.run_batch([
|
||||
{"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions."},
|
||||
|
||||
{"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?"},
|
||||
])
|
||||
states = multi_turn_question.run_batch(
|
||||
[
|
||||
{
|
||||
"question_1": "What is the capital of the United States?",
|
||||
"question_2": "List two local attractions.",
|
||||
},
|
||||
{
|
||||
"question_1": "What is the capital of France?",
|
||||
"question_2": "What is the population of this city?",
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
for s in states:
|
||||
print(s.messages())
|
||||
|
||||
@@ -4,21 +4,21 @@ export TOGETHER_API_KEY=sk-******
|
||||
python3 together_example_complete.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
import os
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@sgl.function
|
||||
def few_shot_qa(s, question):
|
||||
s += (
|
||||
"""The following are questions with answers.
|
||||
s += """The following are questions with answers.
|
||||
Q: What is the capital of France?
|
||||
A: Paris
|
||||
Q: What is the capital of Germany?
|
||||
A: Berlin
|
||||
Q: What is the capital of Italy?
|
||||
A: Rome
|
||||
""")
|
||||
"""
|
||||
s += "Q: " + question + "\n"
|
||||
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
|
||||
|
||||
@@ -34,8 +34,8 @@ def single():
|
||||
|
||||
def stream():
|
||||
state = few_shot_qa.run(
|
||||
question="What is the capital of the United States?",
|
||||
stream=True)
|
||||
question="What is the capital of the United States?", stream=True
|
||||
)
|
||||
|
||||
for out in state.text_iter("answer"):
|
||||
print(out, end="", flush=True)
|
||||
@@ -43,10 +43,12 @@ def stream():
|
||||
|
||||
|
||||
def batch():
|
||||
states = few_shot_qa.run_batch([
|
||||
{"question": "What is the capital of the United States?"},
|
||||
{"question": "What is the capital of China?"},
|
||||
])
|
||||
states = few_shot_qa.run_batch(
|
||||
[
|
||||
{"question": "What is the capital of the United States?"},
|
||||
{"question": "What is the capital of China?"},
|
||||
]
|
||||
)
|
||||
|
||||
for s in states:
|
||||
print(s["answer"])
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
Usage:
|
||||
python3 async_io.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
||||
from sglang import Runtime
|
||||
|
||||
|
||||
@@ -14,7 +16,10 @@ async def generate(
|
||||
tokenizer = engine.get_tokenizer()
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": "You will be given question answer tasks.",},
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You will be given question answer tasks.",
|
||||
},
|
||||
{"role": "user", "content": prompt},
|
||||
]
|
||||
|
||||
@@ -36,5 +41,5 @@ if __name__ == "__main__":
|
||||
prompt = "Who is Alan Turing?"
|
||||
sampling_params = {"max_new_tokens": 128}
|
||||
asyncio.run(generate(runtime, prompt, sampling_params))
|
||||
|
||||
|
||||
runtime.shutdown()
|
||||
|
||||
@@ -33,8 +33,7 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
|
||||
)
|
||||
logprobs = step_0.get_meta_info("get_top_k")["decode_top_logprobs"][0]
|
||||
|
||||
print("Decoding step 0:",
|
||||
", ".join(pformat(token[2]) for token in logprobs))
|
||||
print("Decoding step 0:", ", ".join(pformat(token[2]) for token in logprobs))
|
||||
for idx, (f, token) in enumerate(zip(forks, logprobs)):
|
||||
logprob, token_id, text = token
|
||||
f += text
|
||||
@@ -56,17 +55,9 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
|
||||
)
|
||||
|
||||
# calculate probability disparity between the top and secondary tokens
|
||||
x1s = [
|
||||
exp(xt[0][0])
|
||||
for xt in f.get_meta_info("answer")["decode_top_logprobs"]
|
||||
]
|
||||
x2s = [
|
||||
exp(xt[1][0])
|
||||
for xt in f.get_meta_info("answer")["decode_top_logprobs"]
|
||||
]
|
||||
tokens = [
|
||||
xt[0][2] for xt in f.get_meta_info("answer")["decode_top_logprobs"]
|
||||
]
|
||||
x1s = [exp(xt[0][0]) for xt in f.get_meta_info("answer")["decode_top_logprobs"]]
|
||||
x2s = [exp(xt[1][0]) for xt in f.get_meta_info("answer")["decode_top_logprobs"]]
|
||||
tokens = [xt[0][2] for xt in f.get_meta_info("answer")["decode_top_logprobs"]]
|
||||
delta = (sum(x1s) - sum(x2s)) / len(x1s)
|
||||
|
||||
# extract the answer span (without the '<|end_of_text|>' token)
|
||||
@@ -79,42 +70,45 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
|
||||
top_logprobs_num=2,
|
||||
return_text_in_logprobs=True,
|
||||
)
|
||||
answer = answer_forks[idx]['answer_span'].replace('\n', ' ').strip(':')
|
||||
answer = answer_forks[idx]["answer_span"].replace("\n", " ").strip(":")
|
||||
print(
|
||||
f"{YELLOW}Path #{idx} {pformat(text)}[{exp(logprob):.3f}] (score={delta}, answer={answer}){CLEAR}"
|
||||
)
|
||||
generated_text = str(answer_forks[idx])[len("ProgramState("):-1]
|
||||
generated_text = str(answer_forks[idx])[len("ProgramState(") : -1]
|
||||
print(f"{BLUE}{pformat(generated_text)}{CLEAR}")
|
||||
|
||||
if verbose:
|
||||
answer_tokens = [
|
||||
xt[0][2] for xt in answer_forks[idx].get_meta_info(
|
||||
"answer_span")["decode_top_logprobs"]
|
||||
xt[0][2]
|
||||
for xt in answer_forks[idx].get_meta_info("answer_span")[
|
||||
"decode_top_logprobs"
|
||||
]
|
||||
]
|
||||
answer_x1s = [
|
||||
exp(xt[0][0]) for xt in answer_forks[idx].get_meta_info(
|
||||
"answer_span")["decode_top_logprobs"]
|
||||
exp(xt[0][0])
|
||||
for xt in answer_forks[idx].get_meta_info("answer_span")[
|
||||
"decode_top_logprobs"
|
||||
]
|
||||
]
|
||||
answer_x2s = [
|
||||
exp(xt[1][0]) for xt in answer_forks[idx].get_meta_info(
|
||||
"answer_span")["decode_top_logprobs"]
|
||||
exp(xt[1][0])
|
||||
for xt in answer_forks[idx].get_meta_info("answer_span")[
|
||||
"decode_top_logprobs"
|
||||
]
|
||||
]
|
||||
|
||||
for token, x1, x2 in zip(tokens, x1s, x2s):
|
||||
print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})",
|
||||
end="")
|
||||
print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", end="")
|
||||
print("\n===========")
|
||||
for token, x1, x2 in zip(answer_tokens, answer_x1s, answer_x2s):
|
||||
print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})",
|
||||
end="")
|
||||
print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", end="")
|
||||
print()
|
||||
|
||||
|
||||
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
|
||||
|
||||
state = cot_decoding.run(
|
||||
question=
|
||||
r"Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?",
|
||||
question=r"Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?",
|
||||
get_top_k=10,
|
||||
is_chat_model=True,
|
||||
verbose=False,
|
||||
|
||||
@@ -3,10 +3,12 @@ Usage:
|
||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||
python json_decode.py
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
|
||||
import sglang as sgl
|
||||
from pydantic import BaseModel
|
||||
|
||||
import sglang as sgl
|
||||
from sglang.srt.constrained import build_regex_from_object
|
||||
|
||||
character_regex = (
|
||||
|
||||
@@ -14,16 +14,13 @@ Output:
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import time
|
||||
import copy
|
||||
|
||||
import aiohttp
|
||||
import requests
|
||||
|
||||
from llava.conversation import (
|
||||
conv_llava_llama_3,
|
||||
)
|
||||
from llava.conversation import conv_llava_llama_3
|
||||
|
||||
|
||||
async def send_request(url, data, delay=0):
|
||||
|
||||
@@ -14,16 +14,13 @@ Output:
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import time
|
||||
import copy
|
||||
|
||||
import aiohttp
|
||||
import requests
|
||||
|
||||
from llava.conversation import (
|
||||
conv_qwen
|
||||
)
|
||||
from llava.conversation import conv_qwen
|
||||
|
||||
|
||||
async def send_request(url, data, delay=0):
|
||||
|
||||
@@ -2,13 +2,15 @@
|
||||
Usage: python3 srt_example_llava.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
from sglang.srt.utils import load_image
|
||||
from sglang.lang.chat_template import get_chat_template
|
||||
|
||||
from PIL import ImageFile
|
||||
|
||||
import sglang as sgl
|
||||
from sglang.lang.chat_template import get_chat_template
|
||||
from sglang.srt.utils import load_image
|
||||
|
||||
ImageFile.LOAD_TRUNCATED_IMAGES = True # Allow loading of truncated images
|
||||
|
||||
|
||||
@sgl.function
|
||||
def image_qa(s, image, question):
|
||||
s += sgl.user(sgl.image(image) + question)
|
||||
|
||||
@@ -2,15 +2,17 @@
|
||||
Usage: python3 srt_example_llava.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
import os
|
||||
import csv
|
||||
import time
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
import time
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@sgl.function
|
||||
def video_qa(s, num_frames, video_path, question):
|
||||
s += sgl.user(sgl.video(video_path,num_frames) + question)
|
||||
s += sgl.user(sgl.video(video_path, num_frames) + question)
|
||||
s += sgl.assistant(sgl.gen("answer"))
|
||||
|
||||
|
||||
@@ -25,7 +27,6 @@ def single(path, num_frames=16):
|
||||
print(state["answer"], "\n")
|
||||
|
||||
|
||||
|
||||
def split_into_chunks(lst, num_chunks):
|
||||
"""Split a list into a specified number of chunks."""
|
||||
# Calculate the chunk size using integer division. Note that this may drop some items if not evenly divisible.
|
||||
@@ -34,7 +35,7 @@ def split_into_chunks(lst, num_chunks):
|
||||
if chunk_size == 0:
|
||||
chunk_size = len(lst)
|
||||
# Use list comprehension to generate chunks. The last chunk will take any remainder if the list size isn't evenly divisible.
|
||||
chunks = [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
|
||||
chunks = [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]
|
||||
# Ensure we have exactly num_chunks chunks, even if some are empty
|
||||
chunks.extend([[] for _ in range(num_chunks - len(chunks))])
|
||||
return chunks
|
||||
@@ -42,67 +43,73 @@ def split_into_chunks(lst, num_chunks):
|
||||
|
||||
def save_batch_results(batch_video_files, states, cur_chunk, batch_idx, save_dir):
|
||||
csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv"
|
||||
with open(csv_filename, 'w', newline='') as csvfile:
|
||||
with open(csv_filename, "w", newline="") as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
writer.writerow(['video_name', 'answer'])
|
||||
writer.writerow(["video_name", "answer"])
|
||||
for video_path, state in zip(batch_video_files, states):
|
||||
video_name = os.path.basename(video_path)
|
||||
writer.writerow([video_name, state["answer"]])
|
||||
|
||||
|
||||
def compile_and_cleanup_final_results(cur_chunk, num_batches, save_dir):
|
||||
final_csv_filename = f"{save_dir}/final_results_chunk_{cur_chunk}.csv"
|
||||
with open(final_csv_filename, 'w', newline='') as final_csvfile:
|
||||
with open(final_csv_filename, "w", newline="") as final_csvfile:
|
||||
writer = csv.writer(final_csvfile)
|
||||
writer.writerow(['video_name', 'answer'])
|
||||
writer.writerow(["video_name", "answer"])
|
||||
for batch_idx in range(num_batches):
|
||||
batch_csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv"
|
||||
with open(batch_csv_filename, 'r') as batch_csvfile:
|
||||
with open(batch_csv_filename, "r") as batch_csvfile:
|
||||
reader = csv.reader(batch_csvfile)
|
||||
next(reader) # Skip header row
|
||||
for row in reader:
|
||||
writer.writerow(row)
|
||||
os.remove(batch_csv_filename)
|
||||
|
||||
|
||||
def find_video_files(video_dir):
|
||||
# Check if the video_dir is actually a file
|
||||
if os.path.isfile(video_dir):
|
||||
# If it's a file, return it as a single-element list
|
||||
return [video_dir]
|
||||
|
||||
|
||||
# Original logic to find video files in a directory
|
||||
video_files = []
|
||||
for root, dirs, files in os.walk(video_dir):
|
||||
for file in files:
|
||||
if file.endswith(('.mp4', '.avi', '.mov')):
|
||||
if file.endswith((".mp4", ".avi", ".mov")):
|
||||
video_files.append(os.path.join(root, file))
|
||||
return video_files
|
||||
|
||||
|
||||
def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=64):
|
||||
video_files = find_video_files(video_dir)
|
||||
chunked_video_files = split_into_chunks(video_files, num_chunks)[cur_chunk]
|
||||
num_batches = 0
|
||||
|
||||
for i in range(0, len(chunked_video_files), batch_size):
|
||||
batch_video_files = chunked_video_files[i:i + batch_size]
|
||||
batch_video_files = chunked_video_files[i : i + batch_size]
|
||||
print(f"Processing batch of {len(batch_video_files)} video(s)...")
|
||||
|
||||
if not batch_video_files:
|
||||
print("No video files found in the specified directory.")
|
||||
return
|
||||
|
||||
|
||||
batch_input = [
|
||||
{
|
||||
{
|
||||
"num_frames": num_frames,
|
||||
"video_path": video_path,
|
||||
"question": "Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes.",
|
||||
} for video_path in batch_video_files
|
||||
}
|
||||
for video_path in batch_video_files
|
||||
]
|
||||
|
||||
start_time = time.time()
|
||||
states = video_qa.run_batch(batch_input, max_new_tokens=512, temperature=0.2)
|
||||
total_time = time.time() - start_time
|
||||
average_time = total_time / len(batch_video_files)
|
||||
print(f"Number of videos in batch: {len(batch_video_files)}. Average processing time per video: {average_time:.2f} seconds. Total time for this batch: {total_time:.2f} seconds")
|
||||
print(
|
||||
f"Number of videos in batch: {len(batch_video_files)}. Average processing time per video: {average_time:.2f} seconds. Total time for this batch: {total_time:.2f} seconds"
|
||||
)
|
||||
|
||||
save_batch_results(batch_video_files, states, cur_chunk, num_batches, save_dir)
|
||||
num_batches += 1
|
||||
@@ -113,16 +120,47 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=
|
||||
if __name__ == "__main__":
|
||||
|
||||
# Create the parser
|
||||
parser = argparse.ArgumentParser(description='Run video processing with specified port.')
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run video processing with specified port."
|
||||
)
|
||||
|
||||
# Add an argument for the port
|
||||
parser.add_argument('--port', type=int, default=30000, help='The master port for distributed serving.')
|
||||
parser.add_argument('--chunk-idx', type=int, default=0, help='The index of the chunk to process.')
|
||||
parser.add_argument('--num-chunks', type=int, default=8, help='The number of chunks to process.')
|
||||
parser.add_argument('--save-dir', type=str, default="./work_dirs/llava_video", help='The directory to save the processed video files.')
|
||||
parser.add_argument('--video-dir', type=str, default="./videos/Q98Z4OTh8RwmDonc.mp4", help='The directory or path for the processed video files.')
|
||||
parser.add_argument('--model-path', type=str, default="lmms-lab/LLaVA-NeXT-Video-7B", help='The model path for the video processing.')
|
||||
parser.add_argument('--num-frames', type=int, default=16, help='The number of frames to process in each video.' )
|
||||
parser.add_argument(
|
||||
"--port",
|
||||
type=int,
|
||||
default=30000,
|
||||
help="The master port for distributed serving.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chunk-idx", type=int, default=0, help="The index of the chunk to process."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-chunks", type=int, default=8, help="The number of chunks to process."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save-dir",
|
||||
type=str,
|
||||
default="./work_dirs/llava_video",
|
||||
help="The directory to save the processed video files.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--video-dir",
|
||||
type=str,
|
||||
default="./videos/Q98Z4OTh8RwmDonc.mp4",
|
||||
help="The directory or path for the processed video files.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model-path",
|
||||
type=str,
|
||||
default="lmms-lab/LLaVA-NeXT-Video-7B",
|
||||
help="The model path for the video processing.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-frames",
|
||||
type=int,
|
||||
default=16,
|
||||
help="The number of frames to process in each video.",
|
||||
)
|
||||
parser.add_argument("--mm_spatial_pool_stride", type=int, default=2)
|
||||
|
||||
# Parse the arguments
|
||||
@@ -154,7 +192,6 @@ if __name__ == "__main__":
|
||||
if "34b" in args.model_path.lower():
|
||||
model_overide_args["image_token_index"] = 64002
|
||||
|
||||
|
||||
if args.num_frames == 32:
|
||||
model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
|
||||
model_overide_args["max_sequence_length"] = 4096 * 2
|
||||
@@ -162,22 +199,22 @@ if __name__ == "__main__":
|
||||
elif args.num_frames < 32:
|
||||
pass
|
||||
else:
|
||||
print("The maximum number of frames to process is 32. Please specify a valid number of frames.")
|
||||
print(
|
||||
"The maximum number of frames to process is 32. Please specify a valid number of frames."
|
||||
)
|
||||
exit()
|
||||
|
||||
|
||||
runtime = sgl.Runtime(
|
||||
model_path=args.model_path, #"liuhaotian/llava-v1.6-vicuna-7b",
|
||||
model_path=args.model_path, # "liuhaotian/llava-v1.6-vicuna-7b",
|
||||
tokenizer_path=tokenizer_path,
|
||||
port=cur_port,
|
||||
additional_ports=[cur_port+1,cur_port+2,cur_port+3,cur_port+4],
|
||||
additional_ports=[cur_port + 1, cur_port + 2, cur_port + 3, cur_port + 4],
|
||||
model_overide_args=model_overide_args,
|
||||
tp_size=1
|
||||
tp_size=1,
|
||||
)
|
||||
sgl.set_default_backend(runtime)
|
||||
print(f"chat template: {runtime.endpoint.chat_template.name}")
|
||||
|
||||
|
||||
# Run a single request
|
||||
# try:
|
||||
print("\n========== single ==========\n")
|
||||
@@ -185,24 +222,29 @@ if __name__ == "__main__":
|
||||
if os.path.isfile(root):
|
||||
video_files = [root]
|
||||
else:
|
||||
video_files = [os.path.join(root, f) for f in os.listdir(root) if f.endswith(('.mp4', '.avi', '.mov'))] # Add more extensions if needed
|
||||
video_files = [
|
||||
os.path.join(root, f)
|
||||
for f in os.listdir(root)
|
||||
if f.endswith((".mp4", ".avi", ".mov"))
|
||||
] # Add more extensions if needed
|
||||
start_time = time.time() # Start time for processing a single video
|
||||
for cur_video in video_files[:1]:
|
||||
print(cur_video)
|
||||
single(cur_video, num_frames)
|
||||
end_time = time.time() # End time for processing a single video
|
||||
total_time = end_time - start_time
|
||||
average_time = total_time / len(video_files) # Calculate the average processing time
|
||||
average_time = total_time / len(
|
||||
video_files
|
||||
) # Calculate the average processing time
|
||||
print(f"Average processing time per video: {average_time:.2f} seconds")
|
||||
runtime.shutdown()
|
||||
# except Exception as e:
|
||||
# print(e)
|
||||
runtime.shutdown()
|
||||
|
||||
|
||||
# # # Run a batch of requests
|
||||
# print("\n========== batch ==========\n")
|
||||
# if not os.path.exists(args.save_dir):
|
||||
# os.makedirs(args.save_dir)
|
||||
# batch(args.video_dir,args.save_dir,cur_chunk, num_chunks, num_frames, num_chunks)
|
||||
# runtime.shutdown()
|
||||
# runtime.shutdown()
|
||||
|
||||
@@ -15,23 +15,40 @@ incorrect:
|
||||
export OPENAI_API_KEY=sk-******
|
||||
python3 openai_chat_speculative.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
from sglang import function, set_default_backend, OpenAI
|
||||
from sglang import OpenAI, function, set_default_backend
|
||||
|
||||
|
||||
@function(num_api_spec_tokens=256)
|
||||
def gen_character_spec(s):
|
||||
s += sgl.system("You are a helpful assistant.")
|
||||
s += sgl.user("Construct a character within the following format:")
|
||||
s += sgl.assistant("Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n")
|
||||
s += sgl.assistant(
|
||||
"Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
|
||||
)
|
||||
s += sgl.user("Please generate new Name, Birthday and Job.\n")
|
||||
s += sgl.assistant("Name:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n"))
|
||||
s += sgl.assistant(
|
||||
"Name:"
|
||||
+ sgl.gen("name", stop="\n")
|
||||
+ "\nBirthday:"
|
||||
+ sgl.gen("birthday", stop="\n")
|
||||
+ "\nJob:"
|
||||
+ sgl.gen("job", stop="\n")
|
||||
)
|
||||
|
||||
|
||||
@function(num_api_spec_tokens=256)
|
||||
def gen_character_spec_no_few_shot(s):
|
||||
s += sgl.user("Construct a character. For each field stop with a newline\n")
|
||||
s += sgl.assistant("Name:" + sgl.gen("name", stop="\n") + "\nAge:" + sgl.gen("age", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n"))
|
||||
s += sgl.assistant(
|
||||
"Name:"
|
||||
+ sgl.gen("name", stop="\n")
|
||||
+ "\nAge:"
|
||||
+ sgl.gen("age", stop="\n")
|
||||
+ "\nJob:"
|
||||
+ sgl.gen("job", stop="\n")
|
||||
)
|
||||
|
||||
|
||||
@function
|
||||
@@ -45,10 +62,19 @@ def gen_character_normal(s):
|
||||
def multi_turn_question(s, question_1, question_2):
|
||||
s += sgl.system("You are a helpful assistant.")
|
||||
s += sgl.user("Answer questions in the following format:")
|
||||
s += sgl.user("Question 1: What is the capital of France?\nQuestion 2: What is the population of this city?\n")
|
||||
s += sgl.assistant("Answer 1: The capital of France is Paris.\nAnswer 2: The population of Paris in 2024 is estimated to be around 2.1 million for the city proper.\n")
|
||||
s += sgl.user("Question 1: " + question_1+"\nQuestion 2: " + question_2)
|
||||
s += sgl.assistant("Answer 1: " + sgl.gen("answer_1", stop="\n") + "\nAnswer 2: " + sgl.gen("answer_2", stop="\n"))
|
||||
s += sgl.user(
|
||||
"Question 1: What is the capital of France?\nQuestion 2: What is the population of this city?\n"
|
||||
)
|
||||
s += sgl.assistant(
|
||||
"Answer 1: The capital of France is Paris.\nAnswer 2: The population of Paris in 2024 is estimated to be around 2.1 million for the city proper.\n"
|
||||
)
|
||||
s += sgl.user("Question 1: " + question_1 + "\nQuestion 2: " + question_2)
|
||||
s += sgl.assistant(
|
||||
"Answer 1: "
|
||||
+ sgl.gen("answer_1", stop="\n")
|
||||
+ "\nAnswer 2: "
|
||||
+ sgl.gen("answer_2", stop="\n")
|
||||
)
|
||||
|
||||
|
||||
def test_spec_single_turn():
|
||||
@@ -97,7 +123,7 @@ def test_spec_multi_turn_stream():
|
||||
state = multi_turn_question.run(
|
||||
question_1="What is the capital of the United States?",
|
||||
question_2="List two local attractions.",
|
||||
stream=True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for out in state.text_iter():
|
||||
@@ -126,4 +152,4 @@ if __name__ == "__main__":
|
||||
|
||||
print("\n========== test spec multi turn stream ==========\n")
|
||||
# expect error in stream_executor: stream is not supported...
|
||||
test_spec_multi_turn_stream()
|
||||
test_spec_multi_turn_stream()
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
Usage:
|
||||
python3 openai_speculative.py
|
||||
"""
|
||||
from sglang import function, gen, set_default_backend, OpenAI
|
||||
|
||||
from sglang import OpenAI, function, gen, set_default_backend
|
||||
|
||||
|
||||
@function(num_api_spec_tokens=64)
|
||||
@@ -35,7 +36,11 @@ if __name__ == "__main__":
|
||||
backend = OpenAI("gpt-3.5-turbo-instruct")
|
||||
set_default_backend(backend)
|
||||
|
||||
for function in [gen_character_spec, gen_character_no_spec, gen_character_spec_no_few_shot]:
|
||||
for function in [
|
||||
gen_character_spec,
|
||||
gen_character_no_spec,
|
||||
gen_character_spec_no_few_shot,
|
||||
]:
|
||||
backend.token_usage.reset()
|
||||
|
||||
print(f"function: {function.func.__name__}")
|
||||
@@ -46,4 +51,4 @@ if __name__ == "__main__":
|
||||
print("...birthday:", state["birthday"])
|
||||
print("...job:", state["job"])
|
||||
print(backend.token_usage)
|
||||
print()
|
||||
print()
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Usage:
|
||||
python3 parallel_sample.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@@ -12,7 +13,6 @@ def parallel_sample(s, question, n):
|
||||
"Reasoning: I need to use a calculator.\n"
|
||||
"Tool: calculator\n"
|
||||
"Answer: 6\n"
|
||||
|
||||
"Question: Compute 3 + 2 + 2\n"
|
||||
"Reasoning: I will try a calculator.\n"
|
||||
"Tool: calculator\n"
|
||||
@@ -27,13 +27,9 @@ def parallel_sample(s, question, n):
|
||||
|
||||
|
||||
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
|
||||
#sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
|
||||
# sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
|
||||
|
||||
state = parallel_sample.run(
|
||||
question="Compute 5 + 2 + 4.",
|
||||
n=5,
|
||||
temperature=1.0
|
||||
)
|
||||
state = parallel_sample.run(question="Compute 5 + 2 + 4.", n=5, temperature=1.0)
|
||||
|
||||
for i in range(5):
|
||||
obj = {
|
||||
|
||||
@@ -3,13 +3,18 @@ Usage:
|
||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||
python readme_examples.py
|
||||
"""
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@sgl.function
|
||||
def tool_use(s, question):
|
||||
s += "To answer this question: " + question + ". "
|
||||
s += "I need to use a " + sgl.gen("tool", choices=["calculator", "search engine"]) + ". "
|
||||
s += (
|
||||
"I need to use a "
|
||||
+ sgl.gen("tool", choices=["calculator", "search engine"])
|
||||
+ ". "
|
||||
)
|
||||
|
||||
if s["tool"] == "calculator":
|
||||
s += "The math expression is" + sgl.gen("expression")
|
||||
@@ -75,7 +80,7 @@ def driver_batching():
|
||||
{"question": "What is the capital of France?"},
|
||||
{"question": "What is the capital of Japan?"},
|
||||
],
|
||||
progress_bar=True
|
||||
progress_bar=True,
|
||||
)
|
||||
|
||||
for s in states:
|
||||
@@ -85,9 +90,7 @@ def driver_batching():
|
||||
|
||||
def driver_stream():
|
||||
state = text_qa.run(
|
||||
question="What is the capital of France?",
|
||||
temperature=0.1,
|
||||
stream=True
|
||||
question="What is the capital of France?", temperature=0.1, stream=True
|
||||
)
|
||||
|
||||
for out in state.text_iter():
|
||||
@@ -96,7 +99,7 @@ def driver_stream():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
#sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
|
||||
# sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
|
||||
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
|
||||
|
||||
driver_tool_use()
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
Usage:
|
||||
python3 streaming.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
||||
import sglang as sgl
|
||||
|
||||
|
||||
@@ -22,7 +24,7 @@ def stream_a_variable():
|
||||
state = multi_turn_question.run(
|
||||
question_1="What is the capital of the United States?",
|
||||
question_2="List two local attractions.",
|
||||
stream=True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for out in state.text_iter(var_name="answer_2"):
|
||||
@@ -34,7 +36,7 @@ async def async_stream():
|
||||
state = multi_turn_question.run(
|
||||
question_1="What is the capital of the United States?",
|
||||
question_2="List two local attractions.",
|
||||
stream=True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
async for out in state.text_async_iter(var_name="answer_2"):
|
||||
|
||||
@@ -1,45 +1,55 @@
|
||||
import triton_python_backend_utils as pb_utils
|
||||
import numpy
|
||||
import triton_python_backend_utils as pb_utils
|
||||
from pydantic import BaseModel
|
||||
|
||||
import sglang as sgl
|
||||
from sglang import function, set_default_backend
|
||||
from sglang.srt.constrained import build_regex_from_object
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
|
||||
|
||||
|
||||
class Character(BaseModel):
|
||||
name: str
|
||||
eye_color: str
|
||||
house: str
|
||||
|
||||
|
||||
@function
|
||||
def character_gen(s, name):
|
||||
s += (
|
||||
name
|
||||
+ " is a character in Harry Potter. Please fill in the following information about this character.\n"
|
||||
)
|
||||
s += sgl.gen("json_output", max_tokens=256, regex=build_regex_from_object(Character))
|
||||
s += sgl.gen(
|
||||
"json_output", max_tokens=256, regex=build_regex_from_object(Character)
|
||||
)
|
||||
|
||||
|
||||
class TritonPythonModel:
|
||||
def initialize(self, args):
|
||||
print("Initialized.")
|
||||
|
||||
def execute(self, requests):
|
||||
responses = []
|
||||
for request in requests:
|
||||
tensor_in = pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT")
|
||||
if tensor_in is None:
|
||||
return pb_utils.InferenceResponse(output_tensors=[])
|
||||
|
||||
input_list_names = [i.decode('utf-8') if isinstance(i, bytes) else i for i in tensor_in.as_numpy().tolist()]
|
||||
|
||||
input_list_dicts = [{"name":i} for i in input_list_names]
|
||||
input_list_names = [
|
||||
i.decode("utf-8") if isinstance(i, bytes) else i
|
||||
for i in tensor_in.as_numpy().tolist()
|
||||
]
|
||||
|
||||
input_list_dicts = [{"name": i} for i in input_list_names]
|
||||
|
||||
states = character_gen.run_batch(input_list_dicts)
|
||||
character_strs = [state.text() for state in states]
|
||||
|
||||
tensor_out = pb_utils.Tensor("OUTPUT_TEXT", numpy.array(character_strs, dtype=object))
|
||||
tensor_out = pb_utils.Tensor(
|
||||
"OUTPUT_TEXT", numpy.array(character_strs, dtype=object)
|
||||
)
|
||||
|
||||
responses.append(pb_utils.InferenceResponse(output_tensors = [tensor_out]))
|
||||
return responses
|
||||
responses.append(pb_utils.InferenceResponse(output_tensors=[tensor_out]))
|
||||
return responses
|
||||
|
||||
@@ -3,11 +3,12 @@ import code
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--name", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct")
|
||||
parser.add_argument(
|
||||
"--name", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
t = get_tokenizer(args.name)
|
||||
code.interact(local=locals())
|
||||
code.interact(local=locals())
|
||||
|
||||
@@ -183,14 +183,18 @@ class CudaGraphRunner:
|
||||
else:
|
||||
output = LogitProcessorOutput(
|
||||
next_token_logits=output.next_token_logits[:raw_bs],
|
||||
next_token_logprobs=output.next_token_logprobs[:raw_bs]
|
||||
if output.next_token_logprobs is not None
|
||||
else None,
|
||||
next_token_logprobs=(
|
||||
output.next_token_logprobs[:raw_bs]
|
||||
if output.next_token_logprobs is not None
|
||||
else None
|
||||
),
|
||||
normalized_prompt_logprobs=None,
|
||||
prefill_token_logprobs=None,
|
||||
prefill_top_logprobs=None,
|
||||
decode_top_logprobs=output.decode_top_logprobs[:raw_bs]
|
||||
if output.decode_top_logprobs is not None
|
||||
else None,
|
||||
decode_top_logprobs=(
|
||||
output.decode_top_logprobs[:raw_bs]
|
||||
if output.decode_top_logprobs is not None
|
||||
else None
|
||||
),
|
||||
)
|
||||
return output
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""A controller that manages a group of tensor parallel workers."""
|
||||
|
||||
import multiprocessing
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
import pickle
|
||||
|
||||
@@ -11,11 +11,10 @@ import zmq
|
||||
import zmq.asyncio
|
||||
|
||||
from sglang.srt.managers.controller.tp_worker import ModelTpServer
|
||||
from sglang.srt.server_args import PortArgs, ServerArgs, ModelPortArgs
|
||||
from sglang.srt.server_args import ModelPortArgs, PortArgs, ServerArgs
|
||||
from sglang.srt.utils import kill_parent_process
|
||||
from sglang.utils import get_exception_traceback
|
||||
|
||||
|
||||
logger = logging.getLogger("srt.controller")
|
||||
|
||||
|
||||
@@ -45,14 +44,16 @@ def run_tp_server(
|
||||
raise
|
||||
|
||||
|
||||
def launch_tp_servers(gpu_ids, tp_rank_range, server_args,
|
||||
model_port_args, model_overide_args):
|
||||
def launch_tp_servers(
|
||||
gpu_ids, tp_rank_range, server_args, model_port_args, model_overide_args
|
||||
):
|
||||
"""Launch multiple tp servers."""
|
||||
procs = []
|
||||
for i in tp_rank_range:
|
||||
proc = multiprocessing.Process(target=run_tp_server, args=(
|
||||
gpu_ids[i], i, server_args, model_port_args, model_overide_args
|
||||
))
|
||||
proc = multiprocessing.Process(
|
||||
target=run_tp_server,
|
||||
args=(gpu_ids[i], i, server_args, model_port_args, model_overide_args),
|
||||
)
|
||||
proc.start()
|
||||
procs.append(proc)
|
||||
|
||||
@@ -93,7 +94,9 @@ def broadcast_recv_input(data, rank, dist_group):
|
||||
class ControllerSingle:
|
||||
"""A controller that manages a group of tensor parallel workers."""
|
||||
|
||||
def __init__(self, server_args: ServerArgs, port_args: PortArgs, model_overide_args: dict):
|
||||
def __init__(
|
||||
self, server_args: ServerArgs, port_args: PortArgs, model_overide_args: dict
|
||||
):
|
||||
# Parse args
|
||||
self.server_args = server_args
|
||||
self.tp_procs = []
|
||||
@@ -116,8 +119,12 @@ class ControllerSingle:
|
||||
if tp_size_local > 1:
|
||||
tp_rank_range = range(1, tp_size_local)
|
||||
self.tp_procs = launch_tp_servers(
|
||||
gpu_ids, tp_rank_range, server_args,
|
||||
port_args.model_port_args[0], model_overide_args)
|
||||
gpu_ids,
|
||||
tp_rank_range,
|
||||
server_args,
|
||||
port_args.model_port_args[0],
|
||||
model_overide_args,
|
||||
)
|
||||
|
||||
# Launch tp rank 0
|
||||
self.tp_server = ModelTpServer(
|
||||
|
||||
@@ -11,7 +11,11 @@ import torch
|
||||
import torch.nn as nn
|
||||
from vllm.config import DeviceConfig, LoadConfig
|
||||
from vllm.config import ModelConfig as VllmModelConfig
|
||||
from vllm.distributed import init_distributed_environment, initialize_model_parallel, get_tp_group
|
||||
from vllm.distributed import (
|
||||
get_tp_group,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
from vllm.model_executor.models import ModelRegistry
|
||||
|
||||
@@ -89,9 +93,9 @@ class ModelRunner:
|
||||
|
||||
# Set some global args
|
||||
global_server_args_dict["disable_flashinfer"] = server_args.disable_flashinfer
|
||||
global_server_args_dict[
|
||||
"attention_reduce_in_fp32"
|
||||
] = server_args.attention_reduce_in_fp32
|
||||
global_server_args_dict["attention_reduce_in_fp32"] = (
|
||||
server_args.attention_reduce_in_fp32
|
||||
)
|
||||
|
||||
# Load the model and create memory pool
|
||||
self.load_model()
|
||||
|
||||
@@ -241,12 +241,9 @@ class ModelTpServer:
|
||||
|
||||
def print_stats(self):
|
||||
num_used = self.max_total_num_tokens - (
|
||||
self.token_to_kv_pool.available_size()
|
||||
+ self.tree_cache.evictable_size()
|
||||
)
|
||||
throughput = self.num_generated_tokens / (
|
||||
time.time() - self.last_stats_tic
|
||||
self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
|
||||
)
|
||||
throughput = self.num_generated_tokens / (time.time() - self.last_stats_tic)
|
||||
self.num_generated_tokens = 0
|
||||
self.last_stats_tic = time.time()
|
||||
logger.info(
|
||||
@@ -260,8 +257,7 @@ class ModelTpServer:
|
||||
|
||||
def check_memory(self):
|
||||
available_size = (
|
||||
self.token_to_kv_pool.available_size()
|
||||
+ self.tree_cache.evictable_size()
|
||||
self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
|
||||
)
|
||||
if available_size != self.max_total_num_tokens:
|
||||
warnings.warn(
|
||||
@@ -348,7 +344,8 @@ class ModelTpServer:
|
||||
if self.running_batch:
|
||||
available_size -= sum(
|
||||
[
|
||||
(r.sampling_params.max_new_tokens - len(r.output_ids)) * self.new_token_ratio
|
||||
(r.sampling_params.max_new_tokens - len(r.output_ids))
|
||||
* self.new_token_ratio
|
||||
for r in self.running_batch.reqs
|
||||
]
|
||||
)
|
||||
@@ -370,7 +367,9 @@ class ModelTpServer:
|
||||
req.image_offset += 1
|
||||
|
||||
if (
|
||||
req.extend_input_len + req.sampling_params.max_new_tokens + new_batch_total_tokens
|
||||
req.extend_input_len
|
||||
+ req.sampling_params.max_new_tokens
|
||||
+ new_batch_total_tokens
|
||||
< available_size
|
||||
and (
|
||||
req.extend_input_len + new_batch_input_tokens
|
||||
@@ -382,7 +381,9 @@ class ModelTpServer:
|
||||
available_size += delta
|
||||
|
||||
if not (
|
||||
req.extend_input_len + req.sampling_params.max_new_tokens + new_batch_total_tokens
|
||||
req.extend_input_len
|
||||
+ req.sampling_params.max_new_tokens
|
||||
+ new_batch_total_tokens
|
||||
< available_size
|
||||
):
|
||||
# Undo locking
|
||||
|
||||
@@ -335,15 +335,16 @@ class TokenizerManager:
|
||||
)
|
||||
|
||||
if top_logprobs_num > 0:
|
||||
ret["meta_info"][
|
||||
"prefill_top_logprobs"
|
||||
] = self.detokenize_top_logprobs_tokens(
|
||||
ret["meta_info"]["prefill_top_logprobs"], return_text_in_logprobs
|
||||
ret["meta_info"]["prefill_top_logprobs"] = (
|
||||
self.detokenize_top_logprobs_tokens(
|
||||
ret["meta_info"]["prefill_top_logprobs"],
|
||||
return_text_in_logprobs,
|
||||
)
|
||||
)
|
||||
ret["meta_info"][
|
||||
"decode_top_logprobs"
|
||||
] = self.detokenize_top_logprobs_tokens(
|
||||
ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs
|
||||
ret["meta_info"]["decode_top_logprobs"] = (
|
||||
self.detokenize_top_logprobs_tokens(
|
||||
ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs
|
||||
)
|
||||
)
|
||||
return ret
|
||||
|
||||
|
||||
@@ -21,7 +21,9 @@ class ReqToTokenPool:
|
||||
if need_size > self.can_use_mem_size:
|
||||
return None
|
||||
|
||||
select_index = torch.nonzero(self.mem_state).squeeze(1)[:need_size].to(torch.int32)
|
||||
select_index = (
|
||||
torch.nonzero(self.mem_state).squeeze(1)[:need_size].to(torch.int32)
|
||||
)
|
||||
self.mem_state[select_index] = False
|
||||
self.can_use_mem_size -= need_size
|
||||
|
||||
@@ -79,7 +81,9 @@ class TokenToKVPool:
|
||||
|
||||
addition_size = need_size - buffer_len
|
||||
alloc_size = max(addition_size, self.prefetch_chunk_size)
|
||||
select_index = torch.nonzero(self.mem_state).squeeze(1)[:alloc_size].to(torch.int32)
|
||||
select_index = (
|
||||
torch.nonzero(self.mem_state).squeeze(1)[:alloc_size].to(torch.int32)
|
||||
)
|
||||
|
||||
if select_index.shape[0] < addition_size:
|
||||
return None
|
||||
|
||||
@@ -163,9 +163,9 @@ class LlamaDecoderLayer(nn.Module):
|
||||
if rope_scaling is not None and getattr(
|
||||
config, "original_max_position_embeddings", None
|
||||
):
|
||||
rope_scaling[
|
||||
"original_max_position_embeddings"
|
||||
] = config.original_max_position_embeddings
|
||||
rope_scaling["original_max_position_embeddings"] = (
|
||||
config.original_max_position_embeddings
|
||||
)
|
||||
rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
|
||||
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
|
||||
self.self_attn = LlamaAttention(
|
||||
|
||||
@@ -313,7 +313,10 @@ class Qwen2ForCausalLM(nn.Module):
|
||||
param = params_dict[name]
|
||||
weight_loader = getattr(param, "weight_loader", default_weight_loader)
|
||||
weight_loader(param, loaded_weight)
|
||||
if self.config.tie_word_embeddings and name=="model.embed_tokens.weight":
|
||||
if (
|
||||
self.config.tie_word_embeddings
|
||||
and name == "model.embed_tokens.weight"
|
||||
):
|
||||
weight_loader(params_dict["lm_head.weight"], loaded_weight)
|
||||
|
||||
|
||||
|
||||
@@ -401,9 +401,11 @@ class Qwen2MoeForCausalLM(nn.Module):
|
||||
# These are the weights for the experts
|
||||
# (param_name, weight_name, expert_id, shard_id)
|
||||
(
|
||||
"experts.w13_weight"
|
||||
if weight_name in ["gate_proj", "up_proj"]
|
||||
else "experts.w2_weight",
|
||||
(
|
||||
"experts.w13_weight"
|
||||
if weight_name in ["gate_proj", "up_proj"]
|
||||
else "experts.w2_weight"
|
||||
),
|
||||
f"experts.{expert_id}.{weight_name}.weight",
|
||||
expert_id,
|
||||
shard_id,
|
||||
@@ -418,7 +420,7 @@ class Qwen2MoeForCausalLM(nn.Module):
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||
for param_name, weight_name, shard_id in stacked_params_mapping:
|
||||
# Skip non-stacked layers and experts (experts handled below).
|
||||
if weight_name not in name:
|
||||
continue
|
||||
|
||||
@@ -32,8 +32,8 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.managers.controller.manager_multi import (
|
||||
start_controller_process as start_controller_process_multi,
|
||||
)
|
||||
from sglang.srt.managers.controller.manager_single import launch_tp_servers
|
||||
from sglang.srt.managers.controller.manager_single import (
|
||||
launch_tp_servers,
|
||||
start_controller_process as start_controller_process_single,
|
||||
)
|
||||
from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
|
||||
@@ -198,11 +198,22 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
|
||||
|
||||
if server_args.node_rank != 0:
|
||||
tp_size_local = server_args.tp_size // server_args.nnodes
|
||||
gpu_ids = [i for _ in range(server_args.nnodes) for i in range(tp_size_local)]
|
||||
tp_rank_range = list(range(server_args.node_rank * tp_size_local,
|
||||
(server_args.node_rank + 1) * tp_size_local))
|
||||
procs = launch_tp_servers(gpu_ids, tp_rank_range, server_args,
|
||||
port_args.model_port_args[0], model_overide_args)
|
||||
gpu_ids = [
|
||||
i for _ in range(server_args.nnodes) for i in range(tp_size_local)
|
||||
]
|
||||
tp_rank_range = list(
|
||||
range(
|
||||
server_args.node_rank * tp_size_local,
|
||||
(server_args.node_rank + 1) * tp_size_local,
|
||||
)
|
||||
)
|
||||
procs = launch_tp_servers(
|
||||
gpu_ids,
|
||||
tp_rank_range,
|
||||
server_args,
|
||||
port_args.model_port_args[0],
|
||||
model_overide_args,
|
||||
)
|
||||
while True:
|
||||
pass
|
||||
|
||||
|
||||
@@ -10,16 +10,15 @@ import os
|
||||
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
|
||||
def add_image_token(model_path: str):
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
tokenizer.add_tokens(
|
||||
["<image_placeholder>"],
|
||||
special_tokens=True
|
||||
)
|
||||
tokenizer.add_tokens(["<image_placeholder>"], special_tokens=True)
|
||||
|
||||
print(tokenizer)
|
||||
tokenizer.save_pretrained(model_path)
|
||||
|
||||
|
||||
def edit_model_config(model_path):
|
||||
config = AutoConfig.from_pretrained(model_path)
|
||||
|
||||
@@ -29,10 +28,11 @@ def edit_model_config(model_path):
|
||||
print(config)
|
||||
config.save_pretrained(model_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--model-path", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
add_image_token(args.model_path)
|
||||
edit_model_config(args.model_path)
|
||||
edit_model_config(args.model_path)
|
||||
|
||||
Reference in New Issue
Block a user