diff --git a/benchmark/latency_throughput/bench_serving.py b/benchmark/latency_throughput/bench_serving.py index d46b84579..cbe63a55b 100644 --- a/benchmark/latency_throughput/bench_serving.py +++ b/benchmark/latency_throughput/bench_serving.py @@ -38,7 +38,6 @@ def sample_requests( num_requests: int, tokenizer: AutoTokenizer, ) -> List[Tuple[str, int, int]]: - def load_dataset(): with open(dataset_path, encoding="utf-8") as f: dataset = json.load(f) diff --git a/benchmark/line_retrieval/gen_data.py b/benchmark/line_retrieval/gen_data.py index c88ecba49..5763e6615 100644 --- a/benchmark/line_retrieval/gen_data.py +++ b/benchmark/line_retrieval/gen_data.py @@ -48,9 +48,9 @@ def generate_lines(random_words, num_lines, redirect_ratio): ) for i in redirect_indices: target_idx = np.random.choice(min(i * 2 + 100, num_lines)) - lines[i] = ( - f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}." - ) + lines[ + i + ] = f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}." redirects[i] = target_idx # Build links and find sources diff --git a/python/sglang/backend/litellm.py b/python/sglang/backend/litellm.py index eef6b0cda..d9b4023ca 100644 --- a/python/sglang/backend/litellm.py +++ b/python/sglang/backend/litellm.py @@ -13,7 +13,6 @@ except ImportError as e: class LiteLLM(BaseBackend): - def __init__( self, model_name, diff --git a/python/sglang/lang/compiler.py b/python/sglang/lang/compiler.py index 2c071e407..36287cd39 100644 --- a/python/sglang/lang/compiler.py +++ b/python/sglang/lang/compiler.py @@ -4,7 +4,7 @@ from queue import Queue from typing import List, Union from sglang.global_config import global_config -from sglang.lang.interpreter import ProgramState, StreamExecutor, pin_program +from sglang.lang.interpreter import ProgramState, StreamExecutor, cache_program from sglang.lang.ir import ( SglArgument, SglConstantText, @@ -184,7 +184,7 @@ class CompiledFunction: # Extract prefix by tracing and cache it if len(batch_kwargs) > 1: - pin_program(self.function, backend) + cache_program(self.function, backend) # Run all programs if num_threads == "auto": diff --git a/python/sglang/launch_server_llavavid.py b/python/sglang/launch_server_llavavid.py index a048c3dec..b71d8701d 100644 --- a/python/sglang/launch_server_llavavid.py +++ b/python/sglang/launch_server_llavavid.py @@ -6,7 +6,6 @@ import multiprocessing as mp from sglang.srt.server import ServerArgs, launch_server if __name__ == "__main__": - model_overide_args = {} model_overide_args["mm_spatial_pool_stride"] = 2 diff --git a/python/sglang/srt/managers/controller/infer_batch.py b/python/sglang/srt/managers/controller/infer_batch.py index 653225d68..773d6a500 100644 --- a/python/sglang/srt/managers/controller/infer_batch.py +++ b/python/sglang/srt/managers/controller/infer_batch.py @@ -498,9 +498,10 @@ class Batch: req.output_ids = cur_output_ids continue - jump_forward_str, next_state = ( - req.jump_forward_map.jump_forward_symbol(cur_state) - ) + ( + jump_forward_str, + next_state, + ) = req.jump_forward_map.jump_forward_symbol(cur_state) # Make the incrementally decoded text part of jump_forward_str # so that the UTF-8 will not corrupt diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/controller/tp_worker.py index ba19142da..7ee1e5079 100644 --- a/python/sglang/srt/managers/controller/tp_worker.py +++ b/python/sglang/srt/managers/controller/tp_worker.py @@ -283,13 +283,14 @@ class ModelTpServer: (recv_req.image_hash >> 64) % self.model_config.vocab_size, ] req.image_size = recv_req.image_size - req.origin_input_ids, req.image_offset = ( - self.model_runner.model.pad_input_ids( - req.origin_input_ids_unpadded, - req.pad_value, - req.pixel_values.shape, - req.image_size, - ) + ( + req.origin_input_ids, + req.image_offset, + ) = self.model_runner.model.pad_input_ids( + req.origin_input_ids_unpadded, + req.pad_value, + req.pixel_values.shape, + req.image_size, ) req.sampling_params = recv_req.sampling_params req.return_logprob = recv_req.return_logprob diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 20590bc24..e661edfaf 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -35,7 +35,6 @@ class GenerateReqInput: stream: bool = False def post_init(self): - if (self.text is None and self.input_ids is None) or ( self.text is not None and self.input_ids is not None ): diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 42f970370..0d137eb8a 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -334,15 +334,15 @@ class TokenizerManager: ret["meta_info"]["decode_token_logprobs"], return_text_in_logprobs ) if top_logprobs_num > 0: - ret["meta_info"]["prefill_top_logprobs"] = ( - self.detokenize_top_logprobs_tokens( - ret["meta_info"]["prefill_top_logprobs"], return_text_in_logprobs - ) + ret["meta_info"][ + "prefill_top_logprobs" + ] = self.detokenize_top_logprobs_tokens( + ret["meta_info"]["prefill_top_logprobs"], return_text_in_logprobs ) - ret["meta_info"]["decode_top_logprobs"] = ( - self.detokenize_top_logprobs_tokens( - ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs - ) + ret["meta_info"][ + "decode_top_logprobs" + ] = self.detokenize_top_logprobs_tokens( + ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs ) return ret diff --git a/python/sglang/srt/models/chatglm.py b/python/sglang/srt/models/chatglm.py index a15cc3d4c..e9ec3e2d2 100644 --- a/python/sglang/srt/models/chatglm.py +++ b/python/sglang/srt/models/chatglm.py @@ -36,7 +36,6 @@ LoraConfig = None class GLMAttention(nn.Module): - def __init__( self, config, @@ -294,7 +293,6 @@ class GLMTransformer(nn.Module): class ChatGLMModel(nn.Module): - def __init__( self, config, diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 9cae0b105..cbf29055c 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -521,7 +521,6 @@ class Grok1DecoderLayer(nn.Module): hidden_states: torch.Tensor, input_metadata: InputMetadata, ) -> torch.Tensor: - hidden_states = ( self.post_attn_norm( self.self_attn( diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py index 051036525..e60b036bd 100644 --- a/python/sglang/srt/models/llama2.py +++ b/python/sglang/srt/models/llama2.py @@ -160,9 +160,9 @@ class LlamaDecoderLayer(nn.Module): if rope_scaling is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + rope_scaling[ + "original_max_position_embeddings" + ] = config.original_max_position_embeddings max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = LlamaAttention( hidden_size=self.hidden_size,