diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..393c999d2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,9 @@ +repos: + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + - repo: https://github.com/psf/black + rev: stable + hooks: + - id: black diff --git a/benchmark/latency_throughput/bench_serving.py b/benchmark/latency_throughput/bench_serving.py index 24816d4bd..8566420ed 100644 --- a/benchmark/latency_throughput/bench_serving.py +++ b/benchmark/latency_throughput/bench_serving.py @@ -312,8 +312,8 @@ def main(args: argparse.Namespace): np.sum([output_len for _, output_len, _ in REQUEST_LATENCY]) / benchmark_time ) - #latencies = [round(latency, 2) for _, _, latency in REQUEST_LATENCY] - #print(latencies) + # latencies = [round(latency, 2) for _, _, latency in REQUEST_LATENCY] + # print(latencies) print(f"Total time: {benchmark_time:.2f} s") print(f"Request throughput: {args.num_prompts / benchmark_time:.2f} requests/s") diff --git a/benchmark/line_retrieval/gen_data.py b/benchmark/line_retrieval/gen_data.py index 5763e6615..c88ecba49 100644 --- a/benchmark/line_retrieval/gen_data.py +++ b/benchmark/line_retrieval/gen_data.py @@ -48,9 +48,9 @@ def generate_lines(random_words, num_lines, redirect_ratio): ) for i in redirect_indices: target_idx = np.random.choice(min(i * 2 + 100, num_lines)) - lines[ - i - ] = f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}." + lines[i] = ( + f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}." + ) redirects[i] = target_idx # Build links and find sources diff --git a/examples/quick_start/anthropic_example_chat.py b/examples/quick_start/anthropic_example_chat.py index 03dbb0a45..03d699be7 100644 --- a/examples/quick_start/anthropic_example_chat.py +++ b/examples/quick_start/anthropic_example_chat.py @@ -3,6 +3,7 @@ Usage: export ANTHROPIC_API_KEY=sk-****** python3 anthropic_example_chat.py """ + import sglang as sgl @@ -30,7 +31,7 @@ def stream(): state = multi_turn_question.run( question_1="What is the capital of the United States?", question_2="List two local attractions.", - stream=True + stream=True, ) for out in state.text_iter(): @@ -39,13 +40,18 @@ def stream(): def batch(): - states = multi_turn_question.run_batch([ - {"question_1": "What is the capital of the United States?", - "question_2": "List two local attractions."}, - - {"question_1": "What is the capital of France?", - "question_2": "What is the population of this city?"}, - ]) + states = multi_turn_question.run_batch( + [ + { + "question_1": "What is the capital of the United States?", + "question_2": "List two local attractions.", + }, + { + "question_1": "What is the capital of France?", + "question_2": "What is the population of this city?", + }, + ] + ) for s in states: print(s.messages()) diff --git a/examples/quick_start/anthropic_example_complete.py b/examples/quick_start/anthropic_example_complete.py index 35d0e8f62..bce2a61ea 100644 --- a/examples/quick_start/anthropic_example_complete.py +++ b/examples/quick_start/anthropic_example_complete.py @@ -9,15 +9,14 @@ import sglang as sgl @sgl.function def few_shot_qa(s, question): - s += ( -""" + s += """ \n\nHuman: What is the capital of France? \n\nAssistant: Paris \n\nHuman: What is the capital of Germany? \n\nAssistant: Berlin \n\nHuman: What is the capital of Italy? \n\nAssistant: Rome -""") +""" s += "\n\nHuman: " + question + "\n" s += "\n\nAssistant:" + sgl.gen("answer", temperature=0) @@ -33,8 +32,8 @@ def single(): def stream(): state = few_shot_qa.run( - question="What is the capital of the United States?", - stream=True) + question="What is the capital of the United States?", stream=True + ) for out in state.text_iter("answer"): print(out, end="", flush=True) @@ -42,10 +41,12 @@ def stream(): def batch(): - states = few_shot_qa.run_batch([ - {"question": "What is the capital of the United States?"}, - {"question": "What is the capital of China?"}, - ]) + states = few_shot_qa.run_batch( + [ + {"question": "What is the capital of the United States?"}, + {"question": "What is the capital of China?"}, + ] + ) for s in states: print(s["answer"]) diff --git a/examples/quick_start/azure_openai_example_chat.py b/examples/quick_start/azure_openai_example_chat.py index 3c40af8d2..d53f935f4 100644 --- a/examples/quick_start/azure_openai_example_chat.py +++ b/examples/quick_start/azure_openai_example_chat.py @@ -3,9 +3,11 @@ Usage: export AZURE_OPENAI_API_KEY=sk-****** python3 openai_example_chat.py """ -import sglang as sgl + import os +import sglang as sgl + @sgl.function def multi_turn_question(s, question_1, question_2): @@ -32,7 +34,7 @@ def stream(): state = multi_turn_question.run( question_1="What is the capital of the United States?", question_2="List two local attractions.", - stream=True + stream=True, ) for out in state.text_iter(): @@ -41,13 +43,18 @@ def stream(): def batch(): - states = multi_turn_question.run_batch([ - {"question_1": "What is the capital of the United States?", - "question_2": "List two local attractions."}, - - {"question_1": "What is the capital of France?", - "question_2": "What is the population of this city?"}, - ]) + states = multi_turn_question.run_batch( + [ + { + "question_1": "What is the capital of the United States?", + "question_2": "List two local attractions.", + }, + { + "question_1": "What is the capital of France?", + "question_2": "What is the population of this city?", + }, + ] + ) for s in states: print(s.messages()) diff --git a/examples/quick_start/gemini_example_chat.py b/examples/quick_start/gemini_example_chat.py index aafa1665c..0ae623109 100644 --- a/examples/quick_start/gemini_example_chat.py +++ b/examples/quick_start/gemini_example_chat.py @@ -3,6 +3,7 @@ Usage: export GCP_PROJECT_ID=****** python3 gemini_example_chat.py """ + import sglang as sgl @@ -30,7 +31,7 @@ def stream(): state = multi_turn_question.run( question_1="What is the capital of the United States?", question_2="List two local attractions.", - stream=True + stream=True, ) for out in state.text_iter(): @@ -39,13 +40,18 @@ def stream(): def batch(): - states = multi_turn_question.run_batch([ - {"question_1": "What is the capital of the United States?", - "question_2": "List two local attractions."}, - - {"question_1": "What is the capital of France?", - "question_2": "What is the population of this city?"}, - ]) + states = multi_turn_question.run_batch( + [ + { + "question_1": "What is the capital of the United States?", + "question_2": "List two local attractions.", + }, + { + "question_1": "What is the capital of France?", + "question_2": "What is the population of this city?", + }, + ] + ) for s in states: print(s.messages()) diff --git a/examples/quick_start/gemini_example_complete.py b/examples/quick_start/gemini_example_complete.py index 255a3ad4c..5188bf418 100644 --- a/examples/quick_start/gemini_example_complete.py +++ b/examples/quick_start/gemini_example_complete.py @@ -9,15 +9,14 @@ import sglang as sgl @sgl.function def few_shot_qa(s, question): - s += ( -"""The following are questions with answers. + s += """The following are questions with answers. Q: What is the capital of France? A: Paris Q: What is the capital of Germany? A: Berlin Q: What is the capital of Italy? A: Rome -""") +""" s += "Q: " + question + "\n" s += "A:" + sgl.gen("answer", stop="\n", temperature=0) @@ -33,8 +32,8 @@ def single(): def stream(): state = few_shot_qa.run( - question="What is the capital of the United States?", - stream=True) + question="What is the capital of the United States?", stream=True + ) for out in state.text_iter("answer"): print(out, end="", flush=True) @@ -42,10 +41,12 @@ def stream(): def batch(): - states = few_shot_qa.run_batch([ - {"question": "What is the capital of the United States?"}, - {"question": "What is the capital of China?"}, - ]) + states = few_shot_qa.run_batch( + [ + {"question": "What is the capital of the United States?"}, + {"question": "What is the capital of China?"}, + ] + ) for s in states: print(s["answer"]) diff --git a/examples/quick_start/gemini_example_multimodal_chat.py b/examples/quick_start/gemini_example_multimodal_chat.py index fa5e6e8b7..afe0c723f 100644 --- a/examples/quick_start/gemini_example_multimodal_chat.py +++ b/examples/quick_start/gemini_example_multimodal_chat.py @@ -3,6 +3,7 @@ Usage: export GCP_PROJECT_ID=****** python3 gemini_example_multimodal_chat.py """ + import sglang as sgl @@ -19,7 +20,7 @@ if __name__ == "__main__": image_file1="./images/cat.jpeg", image_file2="./images/dog.jpeg", question="Describe difference of the two images in one sentence.", - stream=True + stream=True, ) for out in state.text_iter("answer"): diff --git a/examples/quick_start/openai_example_chat.py b/examples/quick_start/openai_example_chat.py index 66b8536c0..9511e21cf 100644 --- a/examples/quick_start/openai_example_chat.py +++ b/examples/quick_start/openai_example_chat.py @@ -3,6 +3,7 @@ Usage: export OPENAI_API_KEY=sk-****** python3 openai_example_chat.py """ + import sglang as sgl @@ -31,7 +32,7 @@ def stream(): state = multi_turn_question.run( question_1="What is the capital of the United States?", question_2="List two local attractions.", - stream=True + stream=True, ) for out in state.text_iter(): @@ -40,13 +41,18 @@ def stream(): def batch(): - states = multi_turn_question.run_batch([ - {"question_1": "What is the capital of the United States?", - "question_2": "List two local attractions."}, - - {"question_1": "What is the capital of France?", - "question_2": "What is the population of this city?"}, - ]) + states = multi_turn_question.run_batch( + [ + { + "question_1": "What is the capital of the United States?", + "question_2": "List two local attractions.", + }, + { + "question_1": "What is the capital of France?", + "question_2": "What is the population of this city?", + }, + ] + ) for s in states: print(s.messages()) diff --git a/examples/quick_start/openai_example_complete.py b/examples/quick_start/openai_example_complete.py index 41b3c9904..d64bcaf1c 100644 --- a/examples/quick_start/openai_example_complete.py +++ b/examples/quick_start/openai_example_complete.py @@ -9,15 +9,14 @@ import sglang as sgl @sgl.function def few_shot_qa(s, question): - s += ( -"""The following are questions with answers. + s += """The following are questions with answers. Q: What is the capital of France? A: Paris Q: What is the capital of Germany? A: Berlin Q: What is the capital of Italy? A: Rome -""") +""" s += "Q: " + question + "\n" s += "A:" + sgl.gen("answer", stop="\n", temperature=0) @@ -33,8 +32,8 @@ def single(): def stream(): state = few_shot_qa.run( - question="What is the capital of the United States?", - stream=True) + question="What is the capital of the United States?", stream=True + ) for out in state.text_iter("answer"): print(out, end="", flush=True) @@ -42,10 +41,12 @@ def stream(): def batch(): - states = few_shot_qa.run_batch([ - {"question": "What is the capital of the United States?"}, - {"question": "What is the capital of China?"}, - ]) + states = few_shot_qa.run_batch( + [ + {"question": "What is the capital of the United States?"}, + {"question": "What is the capital of China?"}, + ] + ) for s in states: print(s["answer"]) diff --git a/examples/quick_start/openrouter_example_chat.py b/examples/quick_start/openrouter_example_chat.py index 43ac3d4e2..a0b6f15bc 100644 --- a/examples/quick_start/openrouter_example_chat.py +++ b/examples/quick_start/openrouter_example_chat.py @@ -3,9 +3,11 @@ Usage: export OPENROUTER_API_KEY=sk-****** python3 together_example_chat.py """ -import sglang as sgl + import os +import sglang as sgl + @sgl.function def multi_turn_question(s, question_1, question_2): diff --git a/examples/quick_start/srt_example_chat.py b/examples/quick_start/srt_example_chat.py index 2f261b095..b1e1658a2 100644 --- a/examples/quick_start/srt_example_chat.py +++ b/examples/quick_start/srt_example_chat.py @@ -2,6 +2,7 @@ Usage: python3 srt_example_chat.py """ + import sglang as sgl @@ -29,7 +30,7 @@ def stream(): state = multi_turn_question.run( question_1="What is the capital of the United States?", question_2="List two local attractions.", - stream=True + stream=True, ) for out in state.text_iter(): @@ -38,13 +39,18 @@ def stream(): def batch(): - states = multi_turn_question.run_batch([ - {"question_1": "What is the capital of the United States?", - "question_2": "List two local attractions."}, - - {"question_1": "What is the capital of France?", - "question_2": "What is the population of this city?"}, - ]) + states = multi_turn_question.run_batch( + [ + { + "question_1": "What is the capital of the United States?", + "question_2": "List two local attractions.", + }, + { + "question_1": "What is the capital of France?", + "question_2": "What is the population of this city?", + }, + ] + ) for s in states: print(s.messages()) diff --git a/examples/quick_start/srt_example_complete.py b/examples/quick_start/srt_example_complete.py index 200891670..056245979 100644 --- a/examples/quick_start/srt_example_complete.py +++ b/examples/quick_start/srt_example_complete.py @@ -2,20 +2,20 @@ Usage: python3 srt_example_complete.py """ + import sglang as sgl @sgl.function def few_shot_qa(s, question): - s += ( -"""The following are questions with answers. + s += """The following are questions with answers. Q: What is the capital of France? A: Paris Q: What is the capital of Germany? A: Berlin Q: What is the capital of Italy? A: Rome -""") +""" s += "Q: " + question + "\n" s += "A:" + sgl.gen("answer", stop="\n", temperature=0) @@ -31,8 +31,8 @@ def single(): def stream(): state = few_shot_qa.run( - question="What is the capital of the United States?", - stream=True) + question="What is the capital of the United States?", stream=True + ) for out in state.text_iter("answer"): print(out, end="", flush=True) @@ -40,10 +40,12 @@ def stream(): def batch(): - states = few_shot_qa.run_batch([ - {"question": "What is the capital of the United States?"}, - {"question": "What is the capital of China?"}, - ]) + states = few_shot_qa.run_batch( + [ + {"question": "What is the capital of the United States?"}, + {"question": "What is the capital of China?"}, + ] + ) for s in states: print(s["answer"]) diff --git a/examples/quick_start/srt_example_llava.py b/examples/quick_start/srt_example_llava.py index 27685b1d2..5d8f75239 100644 --- a/examples/quick_start/srt_example_llava.py +++ b/examples/quick_start/srt_example_llava.py @@ -1,6 +1,7 @@ """ Usage: python3 srt_example_llava.py """ + import sglang as sgl @@ -12,9 +13,8 @@ def image_qa(s, image_path, question): def single(): state = image_qa.run( - image_path="images/cat.jpeg", - question="What is this?", - max_new_tokens=128) + image_path="images/cat.jpeg", question="What is this?", max_new_tokens=128 + ) print(state["answer"], "\n") @@ -23,7 +23,8 @@ def stream(): image_path="images/cat.jpeg", question="What is this?", max_new_tokens=64, - stream=True) + stream=True, + ) for out in state.text_iter("answer"): print(out, end="", flush=True) @@ -33,8 +34,8 @@ def stream(): def batch(): states = image_qa.run_batch( [ - {"image_path": "images/cat.jpeg", "question":"What is this?"}, - {"image_path": "images/dog.jpeg", "question":"What is this?"}, + {"image_path": "images/cat.jpeg", "question": "What is this?"}, + {"image_path": "images/dog.jpeg", "question": "What is this?"}, ], max_new_tokens=128, ) @@ -43,8 +44,10 @@ def batch(): if __name__ == "__main__": - runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.6-vicuna-7b", - tokenizer_path="llava-hf/llava-1.5-7b-hf") + runtime = sgl.Runtime( + model_path="liuhaotian/llava-v1.6-vicuna-7b", + tokenizer_path="llava-hf/llava-1.5-7b-hf", + ) sgl.set_default_backend(runtime) print(f"chat template: {runtime.endpoint.chat_template.name}") diff --git a/examples/quick_start/srt_example_yi_vl.py b/examples/quick_start/srt_example_yi_vl.py index 359aacac3..66c7d5712 100644 --- a/examples/quick_start/srt_example_yi_vl.py +++ b/examples/quick_start/srt_example_yi_vl.py @@ -3,6 +3,7 @@ Usage: python3 srt_example_yi_vl.py Requirements: transformers==4.38 """ + import sglang as sgl @@ -17,7 +18,8 @@ def single(): image_path="images/cat.jpeg", question="What is this?", max_new_tokens=64, - stop="###") + stop="###", + ) print(state["answer"], "\n") @@ -27,7 +29,8 @@ def stream(): question="What is this?", max_new_tokens=64, stream=True, - stop="###") + stop="###", + ) for out in state.text_iter("answer"): print(out, end="", flush=True) @@ -37,11 +40,11 @@ def stream(): def batch(): states = image_qa.run_batch( [ - {"image_path": "images/cat.jpeg", "question":"What is this?"}, - {"image_path": "images/dog.jpeg", "question":"What is this?"}, + {"image_path": "images/cat.jpeg", "question": "What is this?"}, + {"image_path": "images/dog.jpeg", "question": "What is this?"}, ], max_new_tokens=64, - stop="###" + stop="###", ) for s in states: print(s["answer"], "\n") diff --git a/examples/quick_start/together_example_chat.py b/examples/quick_start/together_example_chat.py index d2834f44e..2d2059062 100644 --- a/examples/quick_start/together_example_chat.py +++ b/examples/quick_start/together_example_chat.py @@ -3,9 +3,11 @@ Usage: export TOGETHER_API_KEY=sk-****** python3 together_example_chat.py """ -import sglang as sgl + import os +import sglang as sgl + @sgl.function def multi_turn_question(s, question_1, question_2): @@ -32,7 +34,7 @@ def stream(): state = multi_turn_question.run( question_1="What is the capital of the United States?", question_2="List two local attractions.", - stream=True + stream=True, ) for out in state.text_iter(): @@ -41,13 +43,18 @@ def stream(): def batch(): - states = multi_turn_question.run_batch([ - {"question_1": "What is the capital of the United States?", - "question_2": "List two local attractions."}, - - {"question_1": "What is the capital of France?", - "question_2": "What is the population of this city?"}, - ]) + states = multi_turn_question.run_batch( + [ + { + "question_1": "What is the capital of the United States?", + "question_2": "List two local attractions.", + }, + { + "question_1": "What is the capital of France?", + "question_2": "What is the population of this city?", + }, + ] + ) for s in states: print(s.messages()) diff --git a/examples/quick_start/together_example_complete.py b/examples/quick_start/together_example_complete.py index 011c652fd..d9119ed6c 100644 --- a/examples/quick_start/together_example_complete.py +++ b/examples/quick_start/together_example_complete.py @@ -4,21 +4,21 @@ export TOGETHER_API_KEY=sk-****** python3 together_example_complete.py """ -import sglang as sgl import os +import sglang as sgl + @sgl.function def few_shot_qa(s, question): - s += ( -"""The following are questions with answers. + s += """The following are questions with answers. Q: What is the capital of France? A: Paris Q: What is the capital of Germany? A: Berlin Q: What is the capital of Italy? A: Rome -""") +""" s += "Q: " + question + "\n" s += "A:" + sgl.gen("answer", stop="\n", temperature=0) @@ -34,8 +34,8 @@ def single(): def stream(): state = few_shot_qa.run( - question="What is the capital of the United States?", - stream=True) + question="What is the capital of the United States?", stream=True + ) for out in state.text_iter("answer"): print(out, end="", flush=True) @@ -43,10 +43,12 @@ def stream(): def batch(): - states = few_shot_qa.run_batch([ - {"question": "What is the capital of the United States?"}, - {"question": "What is the capital of China?"}, - ]) + states = few_shot_qa.run_batch( + [ + {"question": "What is the capital of the United States?"}, + {"question": "What is the capital of China?"}, + ] + ) for s in states: print(s["answer"]) diff --git a/examples/usage/async_io.py b/examples/usage/async_io.py index 68714812f..d12a3a4d9 100644 --- a/examples/usage/async_io.py +++ b/examples/usage/async_io.py @@ -2,7 +2,9 @@ Usage: python3 async_io.py """ + import asyncio + from sglang import Runtime @@ -14,7 +16,10 @@ async def generate( tokenizer = engine.get_tokenizer() messages = [ - {"role": "system", "content": "You will be given question answer tasks.",}, + { + "role": "system", + "content": "You will be given question answer tasks.", + }, {"role": "user", "content": prompt}, ] @@ -36,5 +41,5 @@ if __name__ == "__main__": prompt = "Who is Alan Turing?" sampling_params = {"max_new_tokens": 128} asyncio.run(generate(runtime, prompt, sampling_params)) - + runtime.shutdown() diff --git a/examples/usage/cot_decoding.py b/examples/usage/cot_decoding.py index d81a813c8..5f9cd68d4 100644 --- a/examples/usage/cot_decoding.py +++ b/examples/usage/cot_decoding.py @@ -33,8 +33,7 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose): ) logprobs = step_0.get_meta_info("get_top_k")["decode_top_logprobs"][0] - print("Decoding step 0:", - ", ".join(pformat(token[2]) for token in logprobs)) + print("Decoding step 0:", ", ".join(pformat(token[2]) for token in logprobs)) for idx, (f, token) in enumerate(zip(forks, logprobs)): logprob, token_id, text = token f += text @@ -56,17 +55,9 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose): ) # calculate probability disparity between the top and secondary tokens - x1s = [ - exp(xt[0][0]) - for xt in f.get_meta_info("answer")["decode_top_logprobs"] - ] - x2s = [ - exp(xt[1][0]) - for xt in f.get_meta_info("answer")["decode_top_logprobs"] - ] - tokens = [ - xt[0][2] for xt in f.get_meta_info("answer")["decode_top_logprobs"] - ] + x1s = [exp(xt[0][0]) for xt in f.get_meta_info("answer")["decode_top_logprobs"]] + x2s = [exp(xt[1][0]) for xt in f.get_meta_info("answer")["decode_top_logprobs"]] + tokens = [xt[0][2] for xt in f.get_meta_info("answer")["decode_top_logprobs"]] delta = (sum(x1s) - sum(x2s)) / len(x1s) # extract the answer span (without the '<|end_of_text|>' token) @@ -79,42 +70,45 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose): top_logprobs_num=2, return_text_in_logprobs=True, ) - answer = answer_forks[idx]['answer_span'].replace('\n', ' ').strip(':') + answer = answer_forks[idx]["answer_span"].replace("\n", " ").strip(":") print( f"{YELLOW}Path #{idx} {pformat(text)}[{exp(logprob):.3f}] (score={delta}, answer={answer}){CLEAR}" ) - generated_text = str(answer_forks[idx])[len("ProgramState("):-1] + generated_text = str(answer_forks[idx])[len("ProgramState(") : -1] print(f"{BLUE}{pformat(generated_text)}{CLEAR}") if verbose: answer_tokens = [ - xt[0][2] for xt in answer_forks[idx].get_meta_info( - "answer_span")["decode_top_logprobs"] + xt[0][2] + for xt in answer_forks[idx].get_meta_info("answer_span")[ + "decode_top_logprobs" + ] ] answer_x1s = [ - exp(xt[0][0]) for xt in answer_forks[idx].get_meta_info( - "answer_span")["decode_top_logprobs"] + exp(xt[0][0]) + for xt in answer_forks[idx].get_meta_info("answer_span")[ + "decode_top_logprobs" + ] ] answer_x2s = [ - exp(xt[1][0]) for xt in answer_forks[idx].get_meta_info( - "answer_span")["decode_top_logprobs"] + exp(xt[1][0]) + for xt in answer_forks[idx].get_meta_info("answer_span")[ + "decode_top_logprobs" + ] ] for token, x1, x2 in zip(tokens, x1s, x2s): - print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", - end="") + print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", end="") print("\n===========") for token, x1, x2 in zip(answer_tokens, answer_x1s, answer_x2s): - print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", - end="") + print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", end="") print() sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000")) state = cot_decoding.run( - question= - r"Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?", + question=r"Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?", get_top_k=10, is_chat_model=True, verbose=False, diff --git a/examples/usage/json_decode.py b/examples/usage/json_decode.py index ec2323e68..dc34d3527 100644 --- a/examples/usage/json_decode.py +++ b/examples/usage/json_decode.py @@ -3,10 +3,12 @@ Usage: python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 python json_decode.py """ + from enum import Enum -import sglang as sgl from pydantic import BaseModel + +import sglang as sgl from sglang.srt.constrained import build_regex_from_object character_regex = ( diff --git a/examples/usage/llava/http_llama3_llava_test.py b/examples/usage/llava/http_llama3_llava_test.py index 113adbc8d..813a26af5 100644 --- a/examples/usage/llava/http_llama3_llava_test.py +++ b/examples/usage/llava/http_llama3_llava_test.py @@ -14,16 +14,13 @@ Output: import argparse import asyncio +import copy import json import time -import copy import aiohttp import requests - -from llava.conversation import ( - conv_llava_llama_3, -) +from llava.conversation import conv_llava_llama_3 async def send_request(url, data, delay=0): diff --git a/examples/usage/llava/http_qwen_llava_test.py b/examples/usage/llava/http_qwen_llava_test.py index 9ba206415..1c29658c6 100644 --- a/examples/usage/llava/http_qwen_llava_test.py +++ b/examples/usage/llava/http_qwen_llava_test.py @@ -14,16 +14,13 @@ Output: import argparse import asyncio +import copy import json import time -import copy import aiohttp import requests - -from llava.conversation import ( - conv_qwen -) +from llava.conversation import conv_qwen async def send_request(url, data, delay=0): diff --git a/examples/usage/llava/srt_llava_next_test.py b/examples/usage/llava/srt_llava_next_test.py index d077fb2a6..0f9621648 100644 --- a/examples/usage/llava/srt_llava_next_test.py +++ b/examples/usage/llava/srt_llava_next_test.py @@ -2,13 +2,15 @@ Usage: python3 srt_example_llava.py """ -import sglang as sgl -from sglang.srt.utils import load_image -from sglang.lang.chat_template import get_chat_template - from PIL import ImageFile + +import sglang as sgl +from sglang.lang.chat_template import get_chat_template +from sglang.srt.utils import load_image + ImageFile.LOAD_TRUNCATED_IMAGES = True # Allow loading of truncated images + @sgl.function def image_qa(s, image, question): s += sgl.user(sgl.image(image) + question) diff --git a/examples/usage/llava_video/srt_example_llava_v.py b/examples/usage/llava_video/srt_example_llava_v.py index e18a81ebb..df771f41b 100644 --- a/examples/usage/llava_video/srt_example_llava_v.py +++ b/examples/usage/llava_video/srt_example_llava_v.py @@ -2,15 +2,17 @@ Usage: python3 srt_example_llava.py """ -import sglang as sgl -import os -import csv -import time import argparse +import csv +import os +import time + +import sglang as sgl + @sgl.function def video_qa(s, num_frames, video_path, question): - s += sgl.user(sgl.video(video_path,num_frames) + question) + s += sgl.user(sgl.video(video_path, num_frames) + question) s += sgl.assistant(sgl.gen("answer")) @@ -25,7 +27,6 @@ def single(path, num_frames=16): print(state["answer"], "\n") - def split_into_chunks(lst, num_chunks): """Split a list into a specified number of chunks.""" # Calculate the chunk size using integer division. Note that this may drop some items if not evenly divisible. @@ -34,7 +35,7 @@ def split_into_chunks(lst, num_chunks): if chunk_size == 0: chunk_size = len(lst) # Use list comprehension to generate chunks. The last chunk will take any remainder if the list size isn't evenly divisible. - chunks = [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] + chunks = [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)] # Ensure we have exactly num_chunks chunks, even if some are empty chunks.extend([[] for _ in range(num_chunks - len(chunks))]) return chunks @@ -42,67 +43,73 @@ def split_into_chunks(lst, num_chunks): def save_batch_results(batch_video_files, states, cur_chunk, batch_idx, save_dir): csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv" - with open(csv_filename, 'w', newline='') as csvfile: + with open(csv_filename, "w", newline="") as csvfile: writer = csv.writer(csvfile) - writer.writerow(['video_name', 'answer']) + writer.writerow(["video_name", "answer"]) for video_path, state in zip(batch_video_files, states): video_name = os.path.basename(video_path) writer.writerow([video_name, state["answer"]]) + def compile_and_cleanup_final_results(cur_chunk, num_batches, save_dir): final_csv_filename = f"{save_dir}/final_results_chunk_{cur_chunk}.csv" - with open(final_csv_filename, 'w', newline='') as final_csvfile: + with open(final_csv_filename, "w", newline="") as final_csvfile: writer = csv.writer(final_csvfile) - writer.writerow(['video_name', 'answer']) + writer.writerow(["video_name", "answer"]) for batch_idx in range(num_batches): batch_csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv" - with open(batch_csv_filename, 'r') as batch_csvfile: + with open(batch_csv_filename, "r") as batch_csvfile: reader = csv.reader(batch_csvfile) next(reader) # Skip header row for row in reader: writer.writerow(row) os.remove(batch_csv_filename) + def find_video_files(video_dir): # Check if the video_dir is actually a file if os.path.isfile(video_dir): # If it's a file, return it as a single-element list return [video_dir] - + # Original logic to find video files in a directory video_files = [] for root, dirs, files in os.walk(video_dir): for file in files: - if file.endswith(('.mp4', '.avi', '.mov')): + if file.endswith((".mp4", ".avi", ".mov")): video_files.append(os.path.join(root, file)) return video_files + def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=64): video_files = find_video_files(video_dir) chunked_video_files = split_into_chunks(video_files, num_chunks)[cur_chunk] num_batches = 0 for i in range(0, len(chunked_video_files), batch_size): - batch_video_files = chunked_video_files[i:i + batch_size] + batch_video_files = chunked_video_files[i : i + batch_size] print(f"Processing batch of {len(batch_video_files)} video(s)...") if not batch_video_files: print("No video files found in the specified directory.") return - + batch_input = [ - { + { "num_frames": num_frames, "video_path": video_path, "question": "Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes.", - } for video_path in batch_video_files + } + for video_path in batch_video_files ] start_time = time.time() states = video_qa.run_batch(batch_input, max_new_tokens=512, temperature=0.2) total_time = time.time() - start_time average_time = total_time / len(batch_video_files) - print(f"Number of videos in batch: {len(batch_video_files)}. Average processing time per video: {average_time:.2f} seconds. Total time for this batch: {total_time:.2f} seconds") + print( + f"Number of videos in batch: {len(batch_video_files)}. Average processing time per video: {average_time:.2f} seconds. Total time for this batch: {total_time:.2f} seconds" + ) save_batch_results(batch_video_files, states, cur_chunk, num_batches, save_dir) num_batches += 1 @@ -113,16 +120,47 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size= if __name__ == "__main__": # Create the parser - parser = argparse.ArgumentParser(description='Run video processing with specified port.') + parser = argparse.ArgumentParser( + description="Run video processing with specified port." + ) # Add an argument for the port - parser.add_argument('--port', type=int, default=30000, help='The master port for distributed serving.') - parser.add_argument('--chunk-idx', type=int, default=0, help='The index of the chunk to process.') - parser.add_argument('--num-chunks', type=int, default=8, help='The number of chunks to process.') - parser.add_argument('--save-dir', type=str, default="./work_dirs/llava_video", help='The directory to save the processed video files.') - parser.add_argument('--video-dir', type=str, default="./videos/Q98Z4OTh8RwmDonc.mp4", help='The directory or path for the processed video files.') - parser.add_argument('--model-path', type=str, default="lmms-lab/LLaVA-NeXT-Video-7B", help='The model path for the video processing.') - parser.add_argument('--num-frames', type=int, default=16, help='The number of frames to process in each video.' ) + parser.add_argument( + "--port", + type=int, + default=30000, + help="The master port for distributed serving.", + ) + parser.add_argument( + "--chunk-idx", type=int, default=0, help="The index of the chunk to process." + ) + parser.add_argument( + "--num-chunks", type=int, default=8, help="The number of chunks to process." + ) + parser.add_argument( + "--save-dir", + type=str, + default="./work_dirs/llava_video", + help="The directory to save the processed video files.", + ) + parser.add_argument( + "--video-dir", + type=str, + default="./videos/Q98Z4OTh8RwmDonc.mp4", + help="The directory or path for the processed video files.", + ) + parser.add_argument( + "--model-path", + type=str, + default="lmms-lab/LLaVA-NeXT-Video-7B", + help="The model path for the video processing.", + ) + parser.add_argument( + "--num-frames", + type=int, + default=16, + help="The number of frames to process in each video.", + ) parser.add_argument("--mm_spatial_pool_stride", type=int, default=2) # Parse the arguments @@ -154,7 +192,6 @@ if __name__ == "__main__": if "34b" in args.model_path.lower(): model_overide_args["image_token_index"] = 64002 - if args.num_frames == 32: model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"} model_overide_args["max_sequence_length"] = 4096 * 2 @@ -162,22 +199,22 @@ if __name__ == "__main__": elif args.num_frames < 32: pass else: - print("The maximum number of frames to process is 32. Please specify a valid number of frames.") + print( + "The maximum number of frames to process is 32. Please specify a valid number of frames." + ) exit() - runtime = sgl.Runtime( - model_path=args.model_path, #"liuhaotian/llava-v1.6-vicuna-7b", + model_path=args.model_path, # "liuhaotian/llava-v1.6-vicuna-7b", tokenizer_path=tokenizer_path, port=cur_port, - additional_ports=[cur_port+1,cur_port+2,cur_port+3,cur_port+4], + additional_ports=[cur_port + 1, cur_port + 2, cur_port + 3, cur_port + 4], model_overide_args=model_overide_args, - tp_size=1 + tp_size=1, ) sgl.set_default_backend(runtime) print(f"chat template: {runtime.endpoint.chat_template.name}") - # Run a single request # try: print("\n========== single ==========\n") @@ -185,24 +222,29 @@ if __name__ == "__main__": if os.path.isfile(root): video_files = [root] else: - video_files = [os.path.join(root, f) for f in os.listdir(root) if f.endswith(('.mp4', '.avi', '.mov'))] # Add more extensions if needed + video_files = [ + os.path.join(root, f) + for f in os.listdir(root) + if f.endswith((".mp4", ".avi", ".mov")) + ] # Add more extensions if needed start_time = time.time() # Start time for processing a single video for cur_video in video_files[:1]: print(cur_video) single(cur_video, num_frames) end_time = time.time() # End time for processing a single video total_time = end_time - start_time - average_time = total_time / len(video_files) # Calculate the average processing time + average_time = total_time / len( + video_files + ) # Calculate the average processing time print(f"Average processing time per video: {average_time:.2f} seconds") runtime.shutdown() # except Exception as e: # print(e) runtime.shutdown() - # # # Run a batch of requests # print("\n========== batch ==========\n") # if not os.path.exists(args.save_dir): # os.makedirs(args.save_dir) # batch(args.video_dir,args.save_dir,cur_chunk, num_chunks, num_frames, num_chunks) - # runtime.shutdown() \ No newline at end of file + # runtime.shutdown() diff --git a/examples/usage/openai_chat_speculative.py b/examples/usage/openai_chat_speculative.py index 94eb43276..a9c5f5afb 100644 --- a/examples/usage/openai_chat_speculative.py +++ b/examples/usage/openai_chat_speculative.py @@ -15,23 +15,40 @@ incorrect: export OPENAI_API_KEY=sk-****** python3 openai_chat_speculative.py """ + import sglang as sgl -from sglang import function, set_default_backend, OpenAI +from sglang import OpenAI, function, set_default_backend @function(num_api_spec_tokens=256) def gen_character_spec(s): s += sgl.system("You are a helpful assistant.") s += sgl.user("Construct a character within the following format:") - s += sgl.assistant("Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n") + s += sgl.assistant( + "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n" + ) s += sgl.user("Please generate new Name, Birthday and Job.\n") - s += sgl.assistant("Name:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n")) + s += sgl.assistant( + "Name:" + + sgl.gen("name", stop="\n") + + "\nBirthday:" + + sgl.gen("birthday", stop="\n") + + "\nJob:" + + sgl.gen("job", stop="\n") + ) @function(num_api_spec_tokens=256) def gen_character_spec_no_few_shot(s): s += sgl.user("Construct a character. For each field stop with a newline\n") - s += sgl.assistant("Name:" + sgl.gen("name", stop="\n") + "\nAge:" + sgl.gen("age", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n")) + s += sgl.assistant( + "Name:" + + sgl.gen("name", stop="\n") + + "\nAge:" + + sgl.gen("age", stop="\n") + + "\nJob:" + + sgl.gen("job", stop="\n") + ) @function @@ -45,10 +62,19 @@ def gen_character_normal(s): def multi_turn_question(s, question_1, question_2): s += sgl.system("You are a helpful assistant.") s += sgl.user("Answer questions in the following format:") - s += sgl.user("Question 1: What is the capital of France?\nQuestion 2: What is the population of this city?\n") - s += sgl.assistant("Answer 1: The capital of France is Paris.\nAnswer 2: The population of Paris in 2024 is estimated to be around 2.1 million for the city proper.\n") - s += sgl.user("Question 1: " + question_1+"\nQuestion 2: " + question_2) - s += sgl.assistant("Answer 1: " + sgl.gen("answer_1", stop="\n") + "\nAnswer 2: " + sgl.gen("answer_2", stop="\n")) + s += sgl.user( + "Question 1: What is the capital of France?\nQuestion 2: What is the population of this city?\n" + ) + s += sgl.assistant( + "Answer 1: The capital of France is Paris.\nAnswer 2: The population of Paris in 2024 is estimated to be around 2.1 million for the city proper.\n" + ) + s += sgl.user("Question 1: " + question_1 + "\nQuestion 2: " + question_2) + s += sgl.assistant( + "Answer 1: " + + sgl.gen("answer_1", stop="\n") + + "\nAnswer 2: " + + sgl.gen("answer_2", stop="\n") + ) def test_spec_single_turn(): @@ -97,7 +123,7 @@ def test_spec_multi_turn_stream(): state = multi_turn_question.run( question_1="What is the capital of the United States?", question_2="List two local attractions.", - stream=True + stream=True, ) for out in state.text_iter(): @@ -126,4 +152,4 @@ if __name__ == "__main__": print("\n========== test spec multi turn stream ==========\n") # expect error in stream_executor: stream is not supported... - test_spec_multi_turn_stream() \ No newline at end of file + test_spec_multi_turn_stream() diff --git a/examples/usage/openai_speculative.py b/examples/usage/openai_speculative.py index c64694da6..4389cb059 100644 --- a/examples/usage/openai_speculative.py +++ b/examples/usage/openai_speculative.py @@ -2,7 +2,8 @@ Usage: python3 openai_speculative.py """ -from sglang import function, gen, set_default_backend, OpenAI + +from sglang import OpenAI, function, gen, set_default_backend @function(num_api_spec_tokens=64) @@ -35,7 +36,11 @@ if __name__ == "__main__": backend = OpenAI("gpt-3.5-turbo-instruct") set_default_backend(backend) - for function in [gen_character_spec, gen_character_no_spec, gen_character_spec_no_few_shot]: + for function in [ + gen_character_spec, + gen_character_no_spec, + gen_character_spec_no_few_shot, + ]: backend.token_usage.reset() print(f"function: {function.func.__name__}") @@ -46,4 +51,4 @@ if __name__ == "__main__": print("...birthday:", state["birthday"]) print("...job:", state["job"]) print(backend.token_usage) - print() \ No newline at end of file + print() diff --git a/examples/usage/parallel_sample.py b/examples/usage/parallel_sample.py index 288b48ac0..0f3cf1700 100644 --- a/examples/usage/parallel_sample.py +++ b/examples/usage/parallel_sample.py @@ -2,6 +2,7 @@ Usage: python3 parallel_sample.py """ + import sglang as sgl @@ -12,7 +13,6 @@ def parallel_sample(s, question, n): "Reasoning: I need to use a calculator.\n" "Tool: calculator\n" "Answer: 6\n" - "Question: Compute 3 + 2 + 2\n" "Reasoning: I will try a calculator.\n" "Tool: calculator\n" @@ -27,13 +27,9 @@ def parallel_sample(s, question, n): sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct")) -#sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000")) +# sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000")) -state = parallel_sample.run( - question="Compute 5 + 2 + 4.", - n=5, - temperature=1.0 -) +state = parallel_sample.run(question="Compute 5 + 2 + 4.", n=5, temperature=1.0) for i in range(5): obj = { diff --git a/examples/usage/readme_examples.py b/examples/usage/readme_examples.py index 8789e1b13..7269ef148 100644 --- a/examples/usage/readme_examples.py +++ b/examples/usage/readme_examples.py @@ -3,13 +3,18 @@ Usage: python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 python readme_examples.py """ + import sglang as sgl @sgl.function def tool_use(s, question): s += "To answer this question: " + question + ". " - s += "I need to use a " + sgl.gen("tool", choices=["calculator", "search engine"]) + ". " + s += ( + "I need to use a " + + sgl.gen("tool", choices=["calculator", "search engine"]) + + ". " + ) if s["tool"] == "calculator": s += "The math expression is" + sgl.gen("expression") @@ -75,7 +80,7 @@ def driver_batching(): {"question": "What is the capital of France?"}, {"question": "What is the capital of Japan?"}, ], - progress_bar=True + progress_bar=True, ) for s in states: @@ -85,9 +90,7 @@ def driver_batching(): def driver_stream(): state = text_qa.run( - question="What is the capital of France?", - temperature=0.1, - stream=True + question="What is the capital of France?", temperature=0.1, stream=True ) for out in state.text_iter(): @@ -96,7 +99,7 @@ def driver_stream(): if __name__ == "__main__": - #sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct")) + # sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct")) sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000")) driver_tool_use() diff --git a/examples/usage/streaming.py b/examples/usage/streaming.py index 20feaafbc..506ee35c6 100644 --- a/examples/usage/streaming.py +++ b/examples/usage/streaming.py @@ -2,7 +2,9 @@ Usage: python3 streaming.py """ + import asyncio + import sglang as sgl @@ -22,7 +24,7 @@ def stream_a_variable(): state = multi_turn_question.run( question_1="What is the capital of the United States?", question_2="List two local attractions.", - stream=True + stream=True, ) for out in state.text_iter(var_name="answer_2"): @@ -34,7 +36,7 @@ async def async_stream(): state = multi_turn_question.run( question_1="What is the capital of the United States?", question_2="List two local attractions.", - stream=True + stream=True, ) async for out in state.text_async_iter(var_name="answer_2"): diff --git a/examples/usage/triton/models/character_generation/1/model.py b/examples/usage/triton/models/character_generation/1/model.py index e76992f95..5550e9398 100644 --- a/examples/usage/triton/models/character_generation/1/model.py +++ b/examples/usage/triton/models/character_generation/1/model.py @@ -1,45 +1,55 @@ -import triton_python_backend_utils as pb_utils import numpy +import triton_python_backend_utils as pb_utils +from pydantic import BaseModel + import sglang as sgl from sglang import function, set_default_backend from sglang.srt.constrained import build_regex_from_object -from pydantic import BaseModel - sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000")) + class Character(BaseModel): name: str eye_color: str house: str + @function def character_gen(s, name): s += ( name + " is a character in Harry Potter. Please fill in the following information about this character.\n" ) - s += sgl.gen("json_output", max_tokens=256, regex=build_regex_from_object(Character)) + s += sgl.gen( + "json_output", max_tokens=256, regex=build_regex_from_object(Character) + ) class TritonPythonModel: def initialize(self, args): print("Initialized.") + def execute(self, requests): responses = [] for request in requests: tensor_in = pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT") if tensor_in is None: return pb_utils.InferenceResponse(output_tensors=[]) - - input_list_names = [i.decode('utf-8') if isinstance(i, bytes) else i for i in tensor_in.as_numpy().tolist()] - input_list_dicts = [{"name":i} for i in input_list_names] + input_list_names = [ + i.decode("utf-8") if isinstance(i, bytes) else i + for i in tensor_in.as_numpy().tolist() + ] + + input_list_dicts = [{"name": i} for i in input_list_names] states = character_gen.run_batch(input_list_dicts) character_strs = [state.text() for state in states] - tensor_out = pb_utils.Tensor("OUTPUT_TEXT", numpy.array(character_strs, dtype=object)) + tensor_out = pb_utils.Tensor( + "OUTPUT_TEXT", numpy.array(character_strs, dtype=object) + ) - responses.append(pb_utils.InferenceResponse(output_tensors = [tensor_out])) - return responses \ No newline at end of file + responses.append(pb_utils.InferenceResponse(output_tensors=[tensor_out])) + return responses diff --git a/playground/load_tokenizer.py b/playground/load_tokenizer.py index 39fa18424..94cf34bc7 100644 --- a/playground/load_tokenizer.py +++ b/playground/load_tokenizer.py @@ -3,11 +3,12 @@ import code from sglang.srt.hf_transformers_utils import get_tokenizer - if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--name", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct") + parser.add_argument( + "--name", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct" + ) args = parser.parse_args() t = get_tokenizer(args.name) - code.interact(local=locals()) \ No newline at end of file + code.interact(local=locals()) diff --git a/python/sglang/srt/managers/controller/cuda_graph_runner.py b/python/sglang/srt/managers/controller/cuda_graph_runner.py index 7218936be..1be3cfb77 100644 --- a/python/sglang/srt/managers/controller/cuda_graph_runner.py +++ b/python/sglang/srt/managers/controller/cuda_graph_runner.py @@ -183,14 +183,18 @@ class CudaGraphRunner: else: output = LogitProcessorOutput( next_token_logits=output.next_token_logits[:raw_bs], - next_token_logprobs=output.next_token_logprobs[:raw_bs] - if output.next_token_logprobs is not None - else None, + next_token_logprobs=( + output.next_token_logprobs[:raw_bs] + if output.next_token_logprobs is not None + else None + ), normalized_prompt_logprobs=None, prefill_token_logprobs=None, prefill_top_logprobs=None, - decode_top_logprobs=output.decode_top_logprobs[:raw_bs] - if output.decode_top_logprobs is not None - else None, + decode_top_logprobs=( + output.decode_top_logprobs[:raw_bs] + if output.decode_top_logprobs is not None + else None + ), ) return output diff --git a/python/sglang/srt/managers/controller/manager_single.py b/python/sglang/srt/managers/controller/manager_single.py index d4186d484..37af98e9a 100644 --- a/python/sglang/srt/managers/controller/manager_single.py +++ b/python/sglang/srt/managers/controller/manager_single.py @@ -1,7 +1,7 @@ """A controller that manages a group of tensor parallel workers.""" -import multiprocessing import logging +import multiprocessing import os import pickle @@ -11,11 +11,10 @@ import zmq import zmq.asyncio from sglang.srt.managers.controller.tp_worker import ModelTpServer -from sglang.srt.server_args import PortArgs, ServerArgs, ModelPortArgs +from sglang.srt.server_args import ModelPortArgs, PortArgs, ServerArgs from sglang.srt.utils import kill_parent_process from sglang.utils import get_exception_traceback - logger = logging.getLogger("srt.controller") @@ -45,14 +44,16 @@ def run_tp_server( raise -def launch_tp_servers(gpu_ids, tp_rank_range, server_args, - model_port_args, model_overide_args): +def launch_tp_servers( + gpu_ids, tp_rank_range, server_args, model_port_args, model_overide_args +): """Launch multiple tp servers.""" procs = [] for i in tp_rank_range: - proc = multiprocessing.Process(target=run_tp_server, args=( - gpu_ids[i], i, server_args, model_port_args, model_overide_args - )) + proc = multiprocessing.Process( + target=run_tp_server, + args=(gpu_ids[i], i, server_args, model_port_args, model_overide_args), + ) proc.start() procs.append(proc) @@ -93,7 +94,9 @@ def broadcast_recv_input(data, rank, dist_group): class ControllerSingle: """A controller that manages a group of tensor parallel workers.""" - def __init__(self, server_args: ServerArgs, port_args: PortArgs, model_overide_args: dict): + def __init__( + self, server_args: ServerArgs, port_args: PortArgs, model_overide_args: dict + ): # Parse args self.server_args = server_args self.tp_procs = [] @@ -116,8 +119,12 @@ class ControllerSingle: if tp_size_local > 1: tp_rank_range = range(1, tp_size_local) self.tp_procs = launch_tp_servers( - gpu_ids, tp_rank_range, server_args, - port_args.model_port_args[0], model_overide_args) + gpu_ids, + tp_rank_range, + server_args, + port_args.model_port_args[0], + model_overide_args, + ) # Launch tp rank 0 self.tp_server = ModelTpServer( diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py index 80c40e4f5..ae1f555a1 100644 --- a/python/sglang/srt/managers/controller/model_runner.py +++ b/python/sglang/srt/managers/controller/model_runner.py @@ -11,7 +11,11 @@ import torch import torch.nn as nn from vllm.config import DeviceConfig, LoadConfig from vllm.config import ModelConfig as VllmModelConfig -from vllm.distributed import init_distributed_environment, initialize_model_parallel, get_tp_group +from vllm.distributed import ( + get_tp_group, + init_distributed_environment, + initialize_model_parallel, +) from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import ModelRegistry @@ -89,9 +93,9 @@ class ModelRunner: # Set some global args global_server_args_dict["disable_flashinfer"] = server_args.disable_flashinfer - global_server_args_dict[ - "attention_reduce_in_fp32" - ] = server_args.attention_reduce_in_fp32 + global_server_args_dict["attention_reduce_in_fp32"] = ( + server_args.attention_reduce_in_fp32 + ) # Load the model and create memory pool self.load_model() diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/controller/tp_worker.py index c9cd0f3f1..897cab140 100644 --- a/python/sglang/srt/managers/controller/tp_worker.py +++ b/python/sglang/srt/managers/controller/tp_worker.py @@ -241,12 +241,9 @@ class ModelTpServer: def print_stats(self): num_used = self.max_total_num_tokens - ( - self.token_to_kv_pool.available_size() - + self.tree_cache.evictable_size() - ) - throughput = self.num_generated_tokens / ( - time.time() - self.last_stats_tic + self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size() ) + throughput = self.num_generated_tokens / (time.time() - self.last_stats_tic) self.num_generated_tokens = 0 self.last_stats_tic = time.time() logger.info( @@ -260,8 +257,7 @@ class ModelTpServer: def check_memory(self): available_size = ( - self.token_to_kv_pool.available_size() - + self.tree_cache.evictable_size() + self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size() ) if available_size != self.max_total_num_tokens: warnings.warn( @@ -348,7 +344,8 @@ class ModelTpServer: if self.running_batch: available_size -= sum( [ - (r.sampling_params.max_new_tokens - len(r.output_ids)) * self.new_token_ratio + (r.sampling_params.max_new_tokens - len(r.output_ids)) + * self.new_token_ratio for r in self.running_batch.reqs ] ) @@ -370,7 +367,9 @@ class ModelTpServer: req.image_offset += 1 if ( - req.extend_input_len + req.sampling_params.max_new_tokens + new_batch_total_tokens + req.extend_input_len + + req.sampling_params.max_new_tokens + + new_batch_total_tokens < available_size and ( req.extend_input_len + new_batch_input_tokens @@ -382,7 +381,9 @@ class ModelTpServer: available_size += delta if not ( - req.extend_input_len + req.sampling_params.max_new_tokens + new_batch_total_tokens + req.extend_input_len + + req.sampling_params.max_new_tokens + + new_batch_total_tokens < available_size ): # Undo locking diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index bd5012904..75af8e62c 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -335,15 +335,16 @@ class TokenizerManager: ) if top_logprobs_num > 0: - ret["meta_info"][ - "prefill_top_logprobs" - ] = self.detokenize_top_logprobs_tokens( - ret["meta_info"]["prefill_top_logprobs"], return_text_in_logprobs + ret["meta_info"]["prefill_top_logprobs"] = ( + self.detokenize_top_logprobs_tokens( + ret["meta_info"]["prefill_top_logprobs"], + return_text_in_logprobs, + ) ) - ret["meta_info"][ - "decode_top_logprobs" - ] = self.detokenize_top_logprobs_tokens( - ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs + ret["meta_info"]["decode_top_logprobs"] = ( + self.detokenize_top_logprobs_tokens( + ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs + ) ) return ret diff --git a/python/sglang/srt/memory_pool.py b/python/sglang/srt/memory_pool.py index c0a384ccc..28fc512f6 100644 --- a/python/sglang/srt/memory_pool.py +++ b/python/sglang/srt/memory_pool.py @@ -21,7 +21,9 @@ class ReqToTokenPool: if need_size > self.can_use_mem_size: return None - select_index = torch.nonzero(self.mem_state).squeeze(1)[:need_size].to(torch.int32) + select_index = ( + torch.nonzero(self.mem_state).squeeze(1)[:need_size].to(torch.int32) + ) self.mem_state[select_index] = False self.can_use_mem_size -= need_size @@ -79,7 +81,9 @@ class TokenToKVPool: addition_size = need_size - buffer_len alloc_size = max(addition_size, self.prefetch_chunk_size) - select_index = torch.nonzero(self.mem_state).squeeze(1)[:alloc_size].to(torch.int32) + select_index = ( + torch.nonzero(self.mem_state).squeeze(1)[:alloc_size].to(torch.int32) + ) if select_index.shape[0] < addition_size: return None diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py index eca15c7cb..e6b3c1d19 100644 --- a/python/sglang/srt/models/llama2.py +++ b/python/sglang/srt/models/llama2.py @@ -163,9 +163,9 @@ class LlamaDecoderLayer(nn.Module): if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling[ - "original_max_position_embeddings" - ] = config.original_max_position_embeddings + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings + ) rope_is_neox_style = getattr(config, "rope_is_neox_style", True) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = LlamaAttention( diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index 05152c271..8e713cff0 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -313,7 +313,10 @@ class Qwen2ForCausalLM(nn.Module): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - if self.config.tie_word_embeddings and name=="model.embed_tokens.weight": + if ( + self.config.tie_word_embeddings + and name == "model.embed_tokens.weight" + ): weight_loader(params_dict["lm_head.weight"], loaded_weight) diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py index 6e90babcc..ca1f27a63 100644 --- a/python/sglang/srt/models/qwen2_moe.py +++ b/python/sglang/srt/models/qwen2_moe.py @@ -401,9 +401,11 @@ class Qwen2MoeForCausalLM(nn.Module): # These are the weights for the experts # (param_name, weight_name, expert_id, shard_id) ( - "experts.w13_weight" - if weight_name in ["gate_proj", "up_proj"] - else "experts.w2_weight", + ( + "experts.w13_weight" + if weight_name in ["gate_proj", "up_proj"] + else "experts.w2_weight" + ), f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id, @@ -418,7 +420,7 @@ class Qwen2MoeForCausalLM(nn.Module): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: + for param_name, weight_name, shard_id in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). if weight_name not in name: continue diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 30b0e7eec..57862c42c 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -32,8 +32,8 @@ from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.managers.controller.manager_multi import ( start_controller_process as start_controller_process_multi, ) +from sglang.srt.managers.controller.manager_single import launch_tp_servers from sglang.srt.managers.controller.manager_single import ( - launch_tp_servers, start_controller_process as start_controller_process_single, ) from sglang.srt.managers.detokenizer_manager import start_detokenizer_process @@ -198,11 +198,22 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg if server_args.node_rank != 0: tp_size_local = server_args.tp_size // server_args.nnodes - gpu_ids = [i for _ in range(server_args.nnodes) for i in range(tp_size_local)] - tp_rank_range = list(range(server_args.node_rank * tp_size_local, - (server_args.node_rank + 1) * tp_size_local)) - procs = launch_tp_servers(gpu_ids, tp_rank_range, server_args, - port_args.model_port_args[0], model_overide_args) + gpu_ids = [ + i for _ in range(server_args.nnodes) for i in range(tp_size_local) + ] + tp_rank_range = list( + range( + server_args.node_rank * tp_size_local, + (server_args.node_rank + 1) * tp_size_local, + ) + ) + procs = launch_tp_servers( + gpu_ids, + tp_rank_range, + server_args, + port_args.model_port_args[0], + model_overide_args, + ) while True: pass diff --git a/scripts/convert_yi_vl.py b/scripts/convert_yi_vl.py index a45f83a30..bdf37ff92 100644 --- a/scripts/convert_yi_vl.py +++ b/scripts/convert_yi_vl.py @@ -10,16 +10,15 @@ import os from transformers import AutoConfig, AutoTokenizer + def add_image_token(model_path: str): tokenizer = AutoTokenizer.from_pretrained(model_path) - tokenizer.add_tokens( - [""], - special_tokens=True - ) + tokenizer.add_tokens([""], special_tokens=True) print(tokenizer) tokenizer.save_pretrained(model_path) + def edit_model_config(model_path): config = AutoConfig.from_pretrained(model_path) @@ -29,10 +28,11 @@ def edit_model_config(model_path): print(config) config.save_pretrained(model_path) + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model-path", type=str) args = parser.parse_args() add_image_token(args.model_path) - edit_model_config(args.model_path) \ No newline at end of file + edit_model_config(args.model_path)