diff --git a/python/sglang/api.py b/python/sglang/api.py
index de36287c2..80cf080a9 100644
--- a/python/sglang/api.py
+++ b/python/sglang/api.py
@@ -1,4 +1,5 @@
 """Public API"""
+
 import re
 from typing import Callable, List, Optional, Union
 
diff --git a/python/sglang/backend/runtime_endpoint.py b/python/sglang/backend/runtime_endpoint.py
index 21e3f8c46..bf5a58197 100644
--- a/python/sglang/backend/runtime_endpoint.py
+++ b/python/sglang/backend/runtime_endpoint.py
@@ -19,7 +19,9 @@ class RuntimeEndpoint(BaseBackend):
         self.base_url = base_url
         self.auth_token = auth_token
 
-        res = http_request(self.base_url + "/get_model_info", auth_token=self.auth_token)
+        res = http_request(
+            self.base_url + "/get_model_info", auth_token=self.auth_token
+        )
         assert res.status_code == 200
         self.model_info = res.json()
 
@@ -37,7 +39,7 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json={"text": prefix_str, "sampling_params": {"max_new_tokens": 0}},
-            auth_token=self.auth_token
+            auth_token=self.auth_token,
         )
         assert res.status_code == 200
 
@@ -45,14 +47,16 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json={"text": s.text_, "sampling_params": {"max_new_tokens": 0}},
-            auth_token=self.auth_token
+            auth_token=self.auth_token,
         )
         assert res.status_code == 200
 
     def fill_image(self, s: StreamExecutor):
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
         self._add_images(s, data)
-        res = http_request(self.base_url + "/generate", json=data, auth_token=self.auth_token)
+        res = http_request(
+            self.base_url + "/generate", json=data, auth_token=self.auth_token
+        )
         assert res.status_code == 200
 
     def generate(
@@ -82,7 +86,9 @@ class RuntimeEndpoint(BaseBackend):
 
         self._add_images(s, data)
 
-        res = http_request(self.base_url + "/generate", json=data, auth_token=self.auth_token)
+        res = http_request(
+            self.base_url + "/generate", json=data, auth_token=self.auth_token
+        )
         obj = res.json()
         comp = obj["text"]
         return comp, obj["meta_info"]
@@ -115,7 +121,12 @@ class RuntimeEndpoint(BaseBackend):
         data["stream"] = True
         self._add_images(s, data)
 
-        response = http_request(self.base_url + "/generate", json=data, stream=True, auth_token=self.auth_token)
+        response = http_request(
+            self.base_url + "/generate",
+            json=data,
+            stream=True,
+            auth_token=self.auth_token,
+        )
         pos = 0
         incomplete_text = ""
 
@@ -145,7 +156,9 @@ class RuntimeEndpoint(BaseBackend):
         # Cache common prefix
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
         self._add_images(s, data)
-        res = http_request(self.base_url + "/generate", json=data, auth_token=self.auth_token)
+        res = http_request(
+            self.base_url + "/generate", json=data, auth_token=self.auth_token
+        )
         assert res.status_code == 200
         prompt_len = res.json()["meta_info"]["prompt_tokens"]
 
@@ -157,7 +170,9 @@ class RuntimeEndpoint(BaseBackend):
             "logprob_start_len": max(prompt_len - 2, 0),
         }
         self._add_images(s, data)
-        res = http_request(self.base_url + "/generate", json=data, auth_token=self.auth_token)
+        res = http_request(
+            self.base_url + "/generate", json=data, auth_token=self.auth_token
+        )
         assert res.status_code == 200
         obj = res.json()
         normalized_prompt_logprob = [
@@ -172,7 +187,7 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/concate_and_append_request",
             json={"src_rids": src_rids, "dst_rid": dst_rid},
-            auth_token=self.auth_token
+            auth_token=self.auth_token,
        )
         assert res.status_code == 200
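The `runtime_endpoint.py` hunks above are mechanical reformatting: every `http_request` call threading `auth_token` through gets wrapped to satisfy the line-length limit. For orientation, here is a minimal sketch of the request pattern these call sites share, assuming a local server and no auth token (the helper's real definition appears in the `python/sglang/utils.py` hunk further down; this condensed version skips its streaming branch):

```python
import requests


def http_request(url, json=None, auth_token=None):
    # Condensed sketch of sglang.utils.http_request: attach an auth header
    # only when a token is supplied.
    if auth_token is None:
        return requests.post(url, json=json)
    headers = {
        "Content-Type": "application/json",
        "Authentication": f"Bearer {auth_token}",
    }
    return requests.post(url, json=json, headers=headers)


# The prefix-caching call sites above send max_new_tokens=0 so the server
# fills its radix cache without generating any tokens.
res = http_request(
    "http://localhost:30000/generate",  # assumed local endpoint
    json={"text": "Hello", "sampling_params": {"max_new_tokens": 0}},
)
assert res.status_code == 200
```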
diff --git a/python/sglang/lang/chat_template.py b/python/sglang/lang/chat_template.py
index 3b89fa5b0..8e49fe7cf 100644
--- a/python/sglang/lang/chat_template.py
+++ b/python/sglang/lang/chat_template.py
@@ -116,6 +116,21 @@ register_chat_template(
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="chatml-llava",
+        default_system_prompt="Answer the questions.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "\n<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "\n<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "\n<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",),
+        image_token=" <image>\n",
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="vicuna_v1.1",
@@ -168,7 +183,7 @@ register_chat_template(
 def match_vicuna(model_path: str):
     if "vicuna" in model_path.lower():
         return get_chat_template("vicuna_v1.1")
-    if "llava" in model_path.lower():
+    if "llava-v1.5" in model_path.lower():
         return get_chat_template("vicuna_v1.1")
 
@@ -192,6 +207,8 @@ def match_chat_ml(model_path: str):
         return get_chat_template("chatml")
     if "qwen" in model_path and "chat" in model_path:
         return get_chat_template("chatml")
+    if "llava-v1.6-34b" in model_path:
+        return get_chat_template("chatml-llava")
 
 
 @register_chat_template_matching_function
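The new `chatml-llava` template is easiest to read through an example. Below is a standalone sketch (the `render` helper is hypothetical, not sglang's actual API) that applies the role prefixes and suffixes exactly as registered above:

```python
# Role prefixes/suffixes copied from the chatml-llava template above.
ROLES = {
    "system": ("<|im_start|>system\n", "\n<|im_end|>\n"),
    "user": ("<|im_start|>user\n", "\n<|im_end|>\n"),
    "assistant": ("<|im_start|>assistant\n", "\n<|im_end|>\n"),
}


def render(messages):
    """Wrap each turn in its role markers, then open an assistant turn."""
    out = ""
    for role, text in messages:
        prefix, suffix = ROLES[role]
        out += prefix + text + suffix
    return out + ROLES["assistant"][0]


# The image_token " <image>\n" is what a user turn contributes per image.
print(render([
    ("system", "Answer the questions."),
    ("user", " <image>\nWhat is shown in this image?"),
]))
```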
""" + from dataclasses import dataclass diff --git a/python/sglang/srt/conversation.py b/python/sglang/srt/conversation.py index df872f77a..0d384d3dc 100644 --- a/python/sglang/srt/conversation.py +++ b/python/sglang/srt/conversation.py @@ -366,7 +366,8 @@ def generate_chat_conv( if content.type == "text": real_content += content.text elif content.type == "image_url": - real_content += "" + # NOTE: Only works for llava + real_content += "\n" conv.append_image(content.image_url.url) conv.append_message(conv.roles[0], real_content) elif msg_role == "assistant": diff --git a/python/sglang/srt/managers/router/model_rpc.py b/python/sglang/srt/managers/router/model_rpc.py index b57f6ce52..1db787603 100644 --- a/python/sglang/srt/managers/router/model_rpc.py +++ b/python/sglang/srt/managers/router/model_rpc.py @@ -31,6 +31,7 @@ from sglang.srt.utils import ( is_multimodal_model, set_random_seed, ) +from vllm.logger import _default_handler as vllm_default_handler logger = logging.getLogger("model_rpc") @@ -50,6 +51,9 @@ class ModelRpcServer(rpyc.Service): self.tp_size = server_args.tp_size self.schedule_heuristic = server_args.schedule_heuristic self.disable_regex_jump_forward = server_args.disable_regex_jump_forward + vllm_default_handler.setLevel( + level=getattr(logging, server_args.log_level.upper()) + ) # Init model and tokenizer self.model_config = ModelConfig( @@ -83,9 +87,11 @@ class ModelRpcServer(rpyc.Service): self.max_num_running_seq = self.max_total_num_token // 2 self.max_prefill_num_token = max( self.model_config.context_len, - self.max_total_num_token // 6 - if server_args.max_prefill_num_token is None - else server_args.max_prefill_num_token, + ( + self.max_total_num_token // 6 + if server_args.max_prefill_num_token is None + else server_args.max_prefill_num_token + ), ) self.int_token_logit_bias = torch.tensor( get_int_token_logit_bias(self.tokenizer, self.model_config.vocab_size) @@ -534,7 +540,7 @@ class ModelRpcServer(rpyc.Service): output_skip_special_tokens.append( req.sampling_params.skip_special_tokens ) - + # For the length of input_ids, which will be accumulated during jump-forward. # Use the original length of input_ids to calculate the token usage info. 
diff --git a/python/sglang/srt/managers/router/model_rpc.py b/python/sglang/srt/managers/router/model_rpc.py
index b57f6ce52..1db787603 100644
--- a/python/sglang/srt/managers/router/model_rpc.py
+++ b/python/sglang/srt/managers/router/model_rpc.py
@@ -31,6 +31,7 @@ from sglang.srt.utils import (
     is_multimodal_model,
     set_random_seed,
 )
+from vllm.logger import _default_handler as vllm_default_handler
 
 logger = logging.getLogger("model_rpc")
 
@@ -50,6 +51,9 @@ class ModelRpcServer(rpyc.Service):
         self.tp_size = server_args.tp_size
         self.schedule_heuristic = server_args.schedule_heuristic
         self.disable_regex_jump_forward = server_args.disable_regex_jump_forward
+        vllm_default_handler.setLevel(
+            level=getattr(logging, server_args.log_level.upper())
+        )
 
         # Init model and tokenizer
         self.model_config = ModelConfig(
@@ -83,9 +87,11 @@ class ModelRpcServer(rpyc.Service):
         self.max_num_running_seq = self.max_total_num_token // 2
         self.max_prefill_num_token = max(
             self.model_config.context_len,
-            self.max_total_num_token // 6
-            if server_args.max_prefill_num_token is None
-            else server_args.max_prefill_num_token,
+            (
+                self.max_total_num_token // 6
+                if server_args.max_prefill_num_token is None
+                else server_args.max_prefill_num_token
+            ),
         )
         self.int_token_logit_bias = torch.tensor(
             get_int_token_logit_bias(self.tokenizer, self.model_config.vocab_size)
@@ -534,7 +540,7 @@ class ModelRpcServer(rpyc.Service):
                 output_skip_special_tokens.append(
                     req.sampling_params.skip_special_tokens
                 )
-
+
                 # For the length of input_ids, which will be accumulated during jump-forward.
                 # Use the original length of input_ids to calculate the token usage info.
                 meta_info = {
diff --git a/python/sglang/srt/managers/router/model_runner.py b/python/sglang/srt/managers/router/model_runner.py
index 8fffbd958..7e4a393f1 100644
--- a/python/sglang/srt/managers/router/model_runner.py
+++ b/python/sglang/srt/managers/router/model_runner.py
@@ -112,7 +112,9 @@ class InputMetadata:
             (self.batch_size,), dtype=torch.int32, device="cuda"
         )
 
-        workspace_buffer = torch.empty(32 * 1024 * 1024, dtype=torch.int8, device="cuda")
+        workspace_buffer = torch.empty(
+            32 * 1024 * 1024, dtype=torch.int8, device="cuda"
+        )
         if (
             self.forward_mode == ForwardMode.PREFILL
             or self.forward_mode == ForwardMode.EXTEND
@@ -121,7 +123,9 @@ class InputMetadata:
                 (self.batch_size + 1,), dtype=torch.int32, device="cuda"
             )
             self.qo_indptr[1:] = torch.cumsum(self.extend_seq_lens, dim=0)
-            self.prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(workspace_buffer, "NHD")
+            self.prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
+                workspace_buffer, "NHD"
+            )
             self.prefill_wrapper.begin_forward(
                 self.qo_indptr,
                 self.kv_indptr,
@@ -131,7 +135,9 @@ class InputMetadata:
                 self.model_runner.model_config.num_key_value_heads // tp_size,
             )
         else:
-            self.decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD")
+            self.decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
+                workspace_buffer, "NHD"
+            )
             self.decode_wrapper.begin_forward(
                 self.kv_indptr,
                 self.kv_indices,
diff --git a/python/sglang/srt/memory_pool.py b/python/sglang/srt/memory_pool.py
index 99f86d7b0..c93cb6044 100644
--- a/python/sglang/srt/memory_pool.py
+++ b/python/sglang/srt/memory_pool.py
@@ -1,4 +1,5 @@
 """Memory pool."""
+
 import logging
 
 import torch
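The reformatted `max_prefill_num_token` expression is worth restating: the prefill token budget is an explicit override if one is given, otherwise one sixth of the total token pool, and it never drops below the model's context length. A small sketch with made-up numbers:

```python
def max_prefill_num_token(context_len, max_total_num_token, override=None):
    # Mirrors the expression in ModelRpcServer above.
    budget = max_total_num_token // 6 if override is None else override
    return max(context_len, budget)


# A 4096-context model with a 120k-token KV pool gets a 20k prefill budget...
assert max_prefill_num_token(4096, 120_000) == 20_000
# ...while a small pool falls back to the context length.
assert max_prefill_num_token(4096, 12_000) == 4096
```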
diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py
index d6213812e..c35b9c7eb 100644
--- a/python/sglang/srt/models/llava.py
+++ b/python/sglang/srt/models/llava.py
@@ -1,4 +1,5 @@
 """Inference-only LLaVa model compatible with HuggingFace weights."""
+
 from typing import List, Optional
 
 import numpy as np
@@ -269,7 +270,6 @@ class LlavaLlamaForCausalLM(nn.Module):
             raise ValueError(f"Unexpected select feature: {self.select_feature}")
 
         # load mm_projector
-        # TODO: support TP?
         projector_weights = {
             "model.mm_projector.0": "multi_modal_projector.linear_1",
             "model.mm_projector.2": "multi_modal_projector.linear_2",
diff --git a/python/sglang/srt/models/mistral.py b/python/sglang/srt/models/mistral.py
index 497e1c266..54794e6fc 100644
--- a/python/sglang/srt/models/mistral.py
+++ b/python/sglang/srt/models/mistral.py
@@ -1,4 +1,5 @@
 """Inference-only Mistral model."""
+
 from sglang.srt.models.llama2 import LlamaForCausalLM
 
diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py
index 12185ff9e..ea9e99bb0 100644
--- a/python/sglang/srt/models/mixtral.py
+++ b/python/sglang/srt/models/mixtral.py
@@ -97,14 +97,16 @@ class MixtralMoE(nn.Module):
 
         self.experts = nn.ModuleList(
             [
-                MixtralMLP(
-                    self.num_total_experts,
-                    config.hidden_size,
-                    config.intermediate_size,
-                    linear_method=linear_method,
+                (
+                    MixtralMLP(
+                        self.num_total_experts,
+                        config.hidden_size,
+                        config.intermediate_size,
+                        linear_method=linear_method,
+                    )
+                    if idx in self.expert_indicies
+                    else None
                 )
-                if idx in self.expert_indicies
-                else None
                 for idx in range(self.num_total_experts)
             ]
         )
diff --git a/python/sglang/srt/models/yivl.py b/python/sglang/srt/models/yivl.py
index 0d7a29dc7..014e40c6e 100644
--- a/python/sglang/srt/models/yivl.py
+++ b/python/sglang/srt/models/yivl.py
@@ -1,4 +1,5 @@
 """Inference-only Yi-VL model."""
+
 import os
 from typing import List, Optional
 
diff --git a/python/sglang/srt/sampling_params.py b/python/sglang/srt/sampling_params.py
index 89b89d57d..f096de2d9 100644
--- a/python/sglang/srt/sampling_params.py
+++ b/python/sglang/srt/sampling_params.py
@@ -1,4 +1,5 @@
 """Sampling parameters for text generation."""
+
 from typing import List, Optional, Union
 
 _SAMPLING_EPS = 1e-6
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index e5b066769..e72359374 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -1,4 +1,5 @@
 """SRT: SGLang Runtime"""
+
 import asyncio
 import json
 import multiprocessing as mp
@@ -493,7 +494,7 @@ def launch_server(server_args, pipe_finish_writer):
 
     # Warmup
    try:
-        print("Warmup...", flush=True)
+        # print("Warmup...", flush=True)
         res = requests.post(
             url + "/generate",
             json={
@@ -505,8 +506,8 @@ def launch_server(server_args, pipe_finish_writer):
             },
             timeout=60,
         )
-        print(f"Warmup done. model response: {res.json()['text']}")
-        print("=" * 20, "Server is ready", "=" * 20, flush=True)
+        # print(f"Warmup done. model response: {res.json()['text']}")
+        # print("=" * 20, "Server is ready", "=" * 20, flush=True)
     except requests.exceptions.RequestException as e:
         if pipe_finish_writer is not None:
             pipe_finish_writer.send(str(e))
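With the warmup prints silenced in `server.py`, the same `/generate` call is still handy for checking a server by hand. A sketch of that request (the prompt and sampling values here are assumed; only the request shape and `timeout=60` come from the hunk above):

```python
import requests

res = requests.post(
    "http://localhost:30000/generate",  # assumed default host/port
    json={
        "text": "The capital of France is",
        "sampling_params": {"temperature": 0, "max_new_tokens": 8},
    },
    timeout=60,
)
print(res.json()["text"])
```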
Use {new_port} instead.") port = new_port # then we check on additional ports @@ -157,8 +157,6 @@ def get_int_token_logit_bias(tokenizer, vocab_size): ss = tokenizer.decode([t_id]).strip() if not (ss.isdigit() or len(ss) == 0 or t_id == tokenizer.eos_token_id): logit_bias[t_id] = -1e5 - # else: - # print(ss, t_id) return logit_bias diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 602aa7c59..80d25e1a8 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -1,4 +1,5 @@ """Common utilities for testing and benchmarking""" + import numpy as np import requests from sglang.backend.openai import OpenAI diff --git a/python/sglang/utils.py b/python/sglang/utils.py index a2cd39ee6..3c55797fc 100644 --- a/python/sglang/utils.py +++ b/python/sglang/utils.py @@ -22,7 +22,7 @@ def get_available_gpu_memory(gpu_id, distributed=True): if torch.cuda.current_device() != gpu_id: print( - f"WARN: current device is not {gpu_id}, but {torch.cuda.current_device()}, ", + f"WARNING: current device is not {gpu_id}, but {torch.cuda.current_device()}, ", "which may cause useless memory allocation for torch CUDA context.", ) @@ -95,7 +95,7 @@ def http_request(url, json=None, stream=False, auth_token=None): return requests.post(url, json=json, stream=True) headers = { "Content-Type": "application/json", - "Authentication": f"Bearer {auth_token}" + "Authentication": f"Bearer {auth_token}", } return requests.post(url, json=json, stream=True, headers=headers) else: diff --git a/test/lang/test_srt_backend.py b/test/lang/test_srt_backend.py index 102f57f24..82a9f1ad4 100644 --- a/test/lang/test_srt_backend.py +++ b/test/lang/test_srt_backend.py @@ -1,6 +1,7 @@ """ python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 """ + import json import unittest diff --git a/test/srt/model/bench_llama_low_api.py b/test/srt/model/bench_llama_low_api.py index 3e3534709..cd7c94c27 100644 --- a/test/srt/model/bench_llama_low_api.py +++ b/test/srt/model/bench_llama_low_api.py @@ -66,9 +66,9 @@ class BenchBatch: p_idx = prefix_req_idx[i // fork_num].item() n_idx = self.req_pool_indices[i].item() req_to_token[n_idx, :prefix_len] = req_to_token[p_idx, :prefix_len] - req_to_token[ - n_idx, prefix_len : prefix_len + extend_len - ] = self.out_cache_loc[i * extend_len : (i + 1) * extend_len] + req_to_token[n_idx, prefix_len : prefix_len + extend_len] = ( + self.out_cache_loc[i * extend_len : (i + 1) * extend_len] + ) def update_decode(self, predict_ids, batch_size): assert predict_ids.shape[0] == batch_size @@ -81,9 +81,9 @@ class BenchBatch: self.out_cache_cont_start, self.out_cache_cont_end, ) = self.token_to_kv_pool.alloc_contiguous(batch_size) - self.req_to_token_pool.req_to_token[ - self.req_pool_indices, self.seq_lens - ] = self.out_cache_loc + self.req_to_token_pool.req_to_token[self.req_pool_indices, self.seq_lens] = ( + self.out_cache_loc + ) self.seq_lens.add_(1) diff --git a/test/srt/test_jump_forward.py b/test/srt/test_jump_forward.py index af796a5d1..15ec2caff 100644 --- a/test/srt/test_jump_forward.py +++ b/test/srt/test_jump_forward.py @@ -7,6 +7,7 @@ from sglang.test.test_utils import ( add_common_sglang_args_and_parse, select_sglang_backend, ) + import sglang as sgl IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)" diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 2bff16960..377aa01ce 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ 
diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py
index 2bff16960..377aa01ce 100644
--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -155,7 +155,8 @@ def test_chat_completion_stream(args):
 
 def test_regex(args):
     client = openai.Client(api_key="EMPTY", base_url=args.base_url)
 
-    regex = (r"""\{\n"""
+    regex = (
+        r"""\{\n"""
         + r"""    "name": "[\w]+",\n"""
         + r"""    "population": "[\w\d\s]+"\n"""
         + r"""\}"""