diff --git a/python/sglang/srt/managers/router/infer_batch.py b/python/sglang/srt/managers/router/infer_batch.py
index 3a9b88555..0c49f7d86 100644
--- a/python/sglang/srt/managers/router/infer_batch.py
+++ b/python/sglang/srt/managers/router/infer_batch.py
@@ -63,7 +63,9 @@ class Req:
         # FIXME: This logic does not really solve the problem of determining whether
         # there should be a leading space.
         first_token = self.tokenizer.convert_ids_to_tokens(self.output_ids[0])
-        first_token = first_token.decode() if isinstance(first_token, bytes) else first_token
+        first_token = (
+            first_token.decode() if isinstance(first_token, bytes) else first_token
+        )
         if first_token.startswith("▁"):
             old_output_str = " " + old_output_str
         new_input_string = (
diff --git a/python/sglang/srt/managers/router/model_rpc.py b/python/sglang/srt/managers/router/model_rpc.py
index 88ba48949..869c7e20a 100644
--- a/python/sglang/srt/managers/router/model_rpc.py
+++ b/python/sglang/srt/managers/router/model_rpc.py
@@ -344,9 +344,13 @@ class ModelRpcServer(rpyc.Service):
             return None

         if self.tp_rank == 0:
-            running_req = 0 if self.running_batch is None else len(self.running_batch.reqs)
+            running_req = (
+                0 if self.running_batch is None else len(self.running_batch.reqs)
+            )
             hit_tokens = sum(len(x.prefix_indices) for x in can_run_list)
-            self.tree_cache_metrics["total"] += (hit_tokens + new_batch_input_tokens) / 10**9
+            self.tree_cache_metrics["total"] += (
+                hit_tokens + new_batch_input_tokens
+            ) / 10**9
             self.tree_cache_metrics["hit"] += hit_tokens / 10**9
             tree_cache_hit_rate = (
                 self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
@@ -584,7 +588,7 @@ def start_model_process(port):
     t = ThreadedServer(
         ModelRpcServer(),
         port=port,
-        protocol_config={"allow_pickle": True, "sync_request_timeout": 600},
+        protocol_config={"allow_pickle": True, "sync_request_timeout": 1800},
     )
     t.start()
@@ -598,7 +602,7 @@ def start_model_process(port):
             con = rpyc.connect(
                 "localhost",
                 port,
-                config={"allow_pickle": True, "sync_request_timeout": 600},
+                config={"allow_pickle": True, "sync_request_timeout": 1800},
             )
             break
         except ConnectionRefusedError:
diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py
index 739097330..12185ff9e 100644
--- a/python/sglang/srt/models/mixtral.py
+++ b/python/sglang/srt/models/mixtral.py
@@ -351,7 +351,11 @@ class MixtralForCausalLM(nn.Module):
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, load_format, revision
+            model_name_or_path,
+            cache_dir,
+            load_format,
+            revision,
+            fall_back_to_pt=False,
         ):
             if "rotary_emb.inv_freq" in name:
                 continue