Clean up server_args, triton cache manager (#8332)
This commit is contained in:
@@ -653,6 +653,9 @@ class Scheduler(
|
||||
)
|
||||
)
|
||||
|
||||
embedding_cache_size = int(os.environ.get("SGLANG_VLM_CACHE_SIZE_MB", "100"))
|
||||
init_embedding_cache(embedding_cache_size * 1024 * 1024)
|
||||
|
||||
def init_profier(self):
|
||||
self.torch_profiler = None
|
||||
self.torch_profiler_output_dir: Optional[str] = None
|
||||
@@ -2895,9 +2898,9 @@ def run_scheduler_process(
|
||||
prefix += f" PP{pp_rank}"
|
||||
|
||||
# Configure the process
|
||||
kill_itself_when_parent_died()
|
||||
setproctitle.setproctitle(f"sglang::scheduler{prefix.replace(' ', '_')}")
|
||||
faulthandler.enable()
|
||||
kill_itself_when_parent_died()
|
||||
parent_process = psutil.Process().parent()
|
||||
|
||||
# [For Router] If the env var "SGLANG_DP_RANK" exists, set dp_rank to its value
|
||||
@@ -2912,10 +2915,6 @@ def run_scheduler_process(
|
||||
if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
|
||||
set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
|
||||
|
||||
embedding_cache_size = 100
|
||||
if "SGLANG_VLM_CACHE_SIZE_MB" in os.environ:
|
||||
embedding_cache_size = int(os.environ["SGLANG_VLM_CACHE_SIZE_MB"])
|
||||
init_embedding_cache(embedding_cache_size * 1024 * 1024)
|
||||
# Create a scheduler and run the event loop
|
||||
try:
|
||||
scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, pp_rank, dp_rank)
|
||||
@@ -2926,8 +2925,8 @@ def run_scheduler_process(
|
||||
"max_req_input_len": scheduler.max_req_input_len,
|
||||
}
|
||||
)
|
||||
disaggregation_mode: DisaggregationMode = scheduler.disaggregation_mode
|
||||
|
||||
disaggregation_mode: DisaggregationMode = scheduler.disaggregation_mode
|
||||
if disaggregation_mode == DisaggregationMode.NULL:
|
||||
if server_args.pp_size > 1:
|
||||
scheduler.event_loop_pp()
|
||||
|
||||
Reference in New Issue
Block a user