[Minor] Fix logger and style (#2325)
This commit is contained in:
@@ -163,7 +163,6 @@ async def async_request_openai_completions(
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"stream": not args.disable_stream,
|
||||
"ignore_eos": not args.disable_ignore_eos,
|
||||
"lora_path": request_func_input.lora_name,
|
||||
**request_func_input.extra_request_body,
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
import gc
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
@@ -129,7 +130,7 @@ class ModelRunner:
|
||||
# Global vars
|
||||
if server_args.show_time_cost:
|
||||
enable_show_time_cost()
|
||||
if server_args.disable_disk_cache:
|
||||
if server_args.disable_outlines_disk_cache:
|
||||
from outlines.caching import disable_cache
|
||||
|
||||
disable_cache()
|
||||
@@ -623,8 +624,10 @@ class ModelRunner:
|
||||
if self.server_args.disable_cuda_graph:
|
||||
return
|
||||
|
||||
tic = time.time()
|
||||
logger.info("Capture cuda graph begin. This can take up to several minutes.")
|
||||
self.cuda_graph_runner = CudaGraphRunner(self)
|
||||
logger.info(f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f}s")
|
||||
|
||||
def apply_torch_tp(self):
|
||||
logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
|
||||
|
||||
@@ -122,7 +122,7 @@ class ServerArgs:
|
||||
disable_jump_forward: bool = False
|
||||
disable_cuda_graph: bool = False
|
||||
disable_cuda_graph_padding: bool = False
|
||||
disable_disk_cache: bool = False
|
||||
disable_outlines_disk_cache: bool = False
|
||||
disable_custom_all_reduce: bool = False
|
||||
disable_mla: bool = False
|
||||
disable_overlap_schedule: bool = False
|
||||
@@ -159,7 +159,7 @@ class ServerArgs:
|
||||
if self.tp_size >= 16:
|
||||
self.mem_fraction_static = 0.79
|
||||
elif self.tp_size >= 8:
|
||||
self.mem_fraction_static = 0.82
|
||||
self.mem_fraction_static = 0.81
|
||||
elif self.tp_size >= 4:
|
||||
self.mem_fraction_static = 0.85
|
||||
elif self.tp_size >= 2:
|
||||
@@ -192,7 +192,7 @@ class ServerArgs:
|
||||
)
|
||||
|
||||
if self.attention_backend == "torch_native":
|
||||
logger.info(
|
||||
logger.warning(
|
||||
"Cuda graph is disabled because of using torch native attention backend"
|
||||
)
|
||||
self.disable_cuda_graph = True
|
||||
@@ -204,12 +204,12 @@ class ServerArgs:
|
||||
self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
|
||||
self.schedule_conservativeness = self.schedule_conservativeness * 0.3
|
||||
self.disable_overlap_schedule = True
|
||||
logger.info(
|
||||
logger.warning(
|
||||
f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
|
||||
f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
|
||||
f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
|
||||
"Data parallel size is adjusted to be the same as tensor parallel size. "
|
||||
"Overlap schedule is disabled."
|
||||
"Overlap scheduler is disabled."
|
||||
)
|
||||
|
||||
# GGUF
|
||||
@@ -642,9 +642,9 @@ class ServerArgs:
|
||||
help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-disk-cache",
|
||||
"--disable-outlines-disk-cache",
|
||||
action="store_true",
|
||||
help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
|
||||
help="Disable disk cache of outlines to avoid possible crashes related to file system or high concurrency.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-custom-all-reduce",
|
||||
@@ -745,6 +745,11 @@ class ServerArgs:
|
||||
action=DeprecatedAction,
|
||||
help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytorch' instead.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-disk-cache",
|
||||
action=DeprecatedAction,
|
||||
help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_cli_args(cls, args: argparse.Namespace):
|
||||
|
||||
Reference in New Issue
Block a user