misc: add pre-commit config (#637)
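The .pre-commit-config.yaml added by this commit is not shown in the excerpt below; the hunks only show the formatting it enforces (isort-style import ordering and black-style line wrapping). A minimal sketch of such a config, assuming black and isort hooks with placeholder revisions (the exact hooks and revisions in #637 are not visible here):

repos:
  - repo: https://github.com/PyCQA/isort
    rev: 5.13.2  # assumed revision
    hooks:
      - id: isort
  - repo: https://github.com/psf/black
    rev: 24.4.2  # assumed revision
    hooks:
      - id: black

Running "pre-commit run --all-files" with hooks like these produces the kind of mechanical reformatting seen in the hunks below.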
@@ -183,14 +183,18 @@ class CudaGraphRunner:
         else:
             output = LogitProcessorOutput(
                 next_token_logits=output.next_token_logits[:raw_bs],
-                next_token_logprobs=output.next_token_logprobs[:raw_bs]
-                if output.next_token_logprobs is not None
-                else None,
+                next_token_logprobs=(
+                    output.next_token_logprobs[:raw_bs]
+                    if output.next_token_logprobs is not None
+                    else None
+                ),
                 normalized_prompt_logprobs=None,
                 prefill_token_logprobs=None,
                 prefill_top_logprobs=None,
-                decode_top_logprobs=output.decode_top_logprobs[:raw_bs]
-                if output.decode_top_logprobs is not None
-                else None,
+                decode_top_logprobs=(
+                    output.decode_top_logprobs[:raw_bs]
+                    if output.decode_top_logprobs is not None
+                    else None
+                ),
             )
         return output

@@ -1,7 +1,7 @@
 """A controller that manages a group of tensor parallel workers."""

-import multiprocessing
 import logging
+import multiprocessing
 import os
 import pickle

@@ -11,11 +11,10 @@ import zmq
 import zmq.asyncio

 from sglang.srt.managers.controller.tp_worker import ModelTpServer
-from sglang.srt.server_args import PortArgs, ServerArgs, ModelPortArgs
+from sglang.srt.server_args import ModelPortArgs, PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback

-
 logger = logging.getLogger("srt.controller")


@@ -45,14 +44,16 @@ def run_tp_server(
         raise


-def launch_tp_servers(gpu_ids, tp_rank_range, server_args,
-                      model_port_args, model_overide_args):
+def launch_tp_servers(
+    gpu_ids, tp_rank_range, server_args, model_port_args, model_overide_args
+):
     """Launch multiple tp servers."""
     procs = []
     for i in tp_rank_range:
-        proc = multiprocessing.Process(target=run_tp_server, args=(
-            gpu_ids[i], i, server_args, model_port_args, model_overide_args
-        ))
+        proc = multiprocessing.Process(
+            target=run_tp_server,
+            args=(gpu_ids[i], i, server_args, model_port_args, model_overide_args),
+        )
         proc.start()
         procs.append(proc)

@@ -93,7 +94,9 @@ def broadcast_recv_input(data, rank, dist_group):
 class ControllerSingle:
     """A controller that manages a group of tensor parallel workers."""

-    def __init__(self, server_args: ServerArgs, port_args: PortArgs, model_overide_args: dict):
+    def __init__(
+        self, server_args: ServerArgs, port_args: PortArgs, model_overide_args: dict
+    ):
         # Parse args
         self.server_args = server_args
         self.tp_procs = []
@@ -116,8 +119,12 @@ class ControllerSingle:
         if tp_size_local > 1:
             tp_rank_range = range(1, tp_size_local)
             self.tp_procs = launch_tp_servers(
-                gpu_ids, tp_rank_range, server_args,
-                port_args.model_port_args[0], model_overide_args)
+                gpu_ids,
+                tp_rank_range,
+                server_args,
+                port_args.model_port_args[0],
+                model_overide_args,
+            )

         # Launch tp rank 0
         self.tp_server = ModelTpServer(

@@ -11,7 +11,11 @@ import torch
 import torch.nn as nn
 from vllm.config import DeviceConfig, LoadConfig
 from vllm.config import ModelConfig as VllmModelConfig
-from vllm.distributed import init_distributed_environment, initialize_model_parallel, get_tp_group
+from vllm.distributed import (
+    get_tp_group,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models import ModelRegistry

@@ -89,9 +93,9 @@ class ModelRunner:

         # Set some global args
         global_server_args_dict["disable_flashinfer"] = server_args.disable_flashinfer
-        global_server_args_dict[
-            "attention_reduce_in_fp32"
-        ] = server_args.attention_reduce_in_fp32
+        global_server_args_dict["attention_reduce_in_fp32"] = (
+            server_args.attention_reduce_in_fp32
+        )

         # Load the model and create memory pool
         self.load_model()

@@ -241,12 +241,9 @@ class ModelTpServer:

     def print_stats(self):
         num_used = self.max_total_num_tokens - (
-            self.token_to_kv_pool.available_size()
-            + self.tree_cache.evictable_size()
-        )
-        throughput = self.num_generated_tokens / (
-            time.time() - self.last_stats_tic
+            self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
         )
+        throughput = self.num_generated_tokens / (time.time() - self.last_stats_tic)
         self.num_generated_tokens = 0
         self.last_stats_tic = time.time()
         logger.info(
@@ -260,8 +257,7 @@ class ModelTpServer:

     def check_memory(self):
         available_size = (
-            self.token_to_kv_pool.available_size()
-            + self.tree_cache.evictable_size()
+            self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
         )
         if available_size != self.max_total_num_tokens:
             warnings.warn(
@@ -348,7 +344,8 @@ class ModelTpServer:
         if self.running_batch:
             available_size -= sum(
                 [
-                    (r.sampling_params.max_new_tokens - len(r.output_ids)) * self.new_token_ratio
+                    (r.sampling_params.max_new_tokens - len(r.output_ids))
+                    * self.new_token_ratio
                     for r in self.running_batch.reqs
                 ]
             )
@@ -370,7 +367,9 @@ class ModelTpServer:
                 req.image_offset += 1

             if (
-                req.extend_input_len + req.sampling_params.max_new_tokens + new_batch_total_tokens
+                req.extend_input_len
+                + req.sampling_params.max_new_tokens
+                + new_batch_total_tokens
                 < available_size
                 and (
                     req.extend_input_len + new_batch_input_tokens
@@ -382,7 +381,9 @@ class ModelTpServer:
                 available_size += delta

             if not (
-                req.extend_input_len + req.sampling_params.max_new_tokens + new_batch_total_tokens
+                req.extend_input_len
+                + req.sampling_params.max_new_tokens
+                + new_batch_total_tokens
                 < available_size
             ):
                 # Undo locking

@@ -335,15 +335,16 @@ class TokenizerManager:
            )

            if top_logprobs_num > 0:
-                ret["meta_info"][
-                    "prefill_top_logprobs"
-                ] = self.detokenize_top_logprobs_tokens(
-                    ret["meta_info"]["prefill_top_logprobs"], return_text_in_logprobs
+                ret["meta_info"]["prefill_top_logprobs"] = (
+                    self.detokenize_top_logprobs_tokens(
+                        ret["meta_info"]["prefill_top_logprobs"],
+                        return_text_in_logprobs,
+                    )
                 )
-                ret["meta_info"][
-                    "decode_top_logprobs"
-                ] = self.detokenize_top_logprobs_tokens(
-                    ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs
+                ret["meta_info"]["decode_top_logprobs"] = (
+                    self.detokenize_top_logprobs_tokens(
+                        ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs
+                    )
                 )
         return ret

@@ -21,7 +21,9 @@ class ReqToTokenPool:
         if need_size > self.can_use_mem_size:
             return None

-        select_index = torch.nonzero(self.mem_state).squeeze(1)[:need_size].to(torch.int32)
+        select_index = (
+            torch.nonzero(self.mem_state).squeeze(1)[:need_size].to(torch.int32)
+        )
         self.mem_state[select_index] = False
         self.can_use_mem_size -= need_size

@@ -79,7 +81,9 @@ class TokenToKVPool:

         addition_size = need_size - buffer_len
         alloc_size = max(addition_size, self.prefetch_chunk_size)
-        select_index = torch.nonzero(self.mem_state).squeeze(1)[:alloc_size].to(torch.int32)
+        select_index = (
+            torch.nonzero(self.mem_state).squeeze(1)[:alloc_size].to(torch.int32)
+        )

         if select_index.shape[0] < addition_size:
             return None

@@ -163,9 +163,9 @@ class LlamaDecoderLayer(nn.Module):
         if rope_scaling is not None and getattr(
             config, "original_max_position_embeddings", None
         ):
-            rope_scaling[
-                "original_max_position_embeddings"
-            ] = config.original_max_position_embeddings
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
         rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = LlamaAttention(

@@ -313,7 +313,10 @@ class Qwen2ForCausalLM(nn.Module):
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
-                if self.config.tie_word_embeddings and name=="model.embed_tokens.weight":
+                if (
+                    self.config.tie_word_embeddings
+                    and name == "model.embed_tokens.weight"
+                ):
                     weight_loader(params_dict["lm_head.weight"], loaded_weight)


@@ -401,9 +401,11 @@ class Qwen2MoeForCausalLM(nn.Module):
             # These are the weights for the experts
             # (param_name, weight_name, expert_id, shard_id)
             (
-                "experts.w13_weight"
-                if weight_name in ["gate_proj", "up_proj"]
-                else "experts.w2_weight",
+                (
+                    "experts.w13_weight"
+                    if weight_name in ["gate_proj", "up_proj"]
+                    else "experts.w2_weight"
+                ),
                 f"experts.{expert_id}.{weight_name}.weight",
                 expert_id,
                 shard_id,
@@ -418,7 +420,7 @@ class Qwen2MoeForCausalLM(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue

@@ -32,8 +32,8 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.controller.manager_multi import (
     start_controller_process as start_controller_process_multi,
 )
-from sglang.srt.managers.controller.manager_single import launch_tp_servers
+from sglang.srt.managers.controller.manager_single import (
+    launch_tp_servers,
+    start_controller_process as start_controller_process_single,
+)
 from sglang.srt.managers.detokenizer_manager import start_detokenizer_process

@@ -198,11 +198,22 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg

     if server_args.node_rank != 0:
         tp_size_local = server_args.tp_size // server_args.nnodes
-        gpu_ids = [i for _ in range(server_args.nnodes) for i in range(tp_size_local)]
-        tp_rank_range = list(range(server_args.node_rank * tp_size_local,
-                                   (server_args.node_rank + 1) * tp_size_local))
-        procs = launch_tp_servers(gpu_ids, tp_rank_range, server_args,
-                                  port_args.model_port_args[0], model_overide_args)
+        gpu_ids = [
+            i for _ in range(server_args.nnodes) for i in range(tp_size_local)
+        ]
+        tp_rank_range = list(
+            range(
+                server_args.node_rank * tp_size_local,
+                (server_args.node_rank + 1) * tp_size_local,
+            )
+        )
+        procs = launch_tp_servers(
+            gpu_ids,
+            tp_rank_range,
+            server_args,
+            port_args.model_port_args[0],
+            model_overide_args,
+        )
         while True:
             pass