Fix logging (#796)
@@ -18,7 +18,7 @@ except ImportError as e:
     openai = tiktoken = e
 
 
-logger = logging.getLogger("openai")
+logger = logging.getLogger(__name__)
 
 
 def create_logit_bias_int(tokenizer):
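Note: `logging.getLogger(__name__)` names each logger after the dotted path of the module that creates it, so all of these loggers slot into one hierarchy under the package instead of sharing a handful of hard-coded names. A minimal sketch of the pattern (hypothetical module name, not from this repo):

    # mylib/worker.py (illustrative module)
    import logging

    logger = logging.getLogger(__name__)  # logger name becomes "mylib.worker"

    def do_work():
        logger.info("starting work")  # propagates to handlers on "mylib" or the root logger

Records propagate upward through the hierarchy, so one handler configured on the root logger (or on the package logger) covers every module without per-file setup.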
@@ -15,6 +15,7 @@ limitations under the License.
 
 """Meta data for requests and batches"""
 
+import logging
 import warnings
 from dataclasses import dataclass
 from enum import IntEnum, auto
@@ -40,6 +41,9 @@ global_server_args_dict = {
 }
 
 
+logger = logging.getLogger(__name__)
+
+
 class ForwardMode(IntEnum):
     # Prefill a new sequence. This is deprecated now. "EXTEND" covers this case.
     PREFILL = auto()
@@ -379,7 +383,7 @@ class Batch:
         out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
 
         if out_cache_loc is None:
-            print("Prefill out of memory. This should never happen.")
+            logger.error("Prefill out of memory. This should never happen.")
             self.tree_cache.pretty_print()
             exit()
 
@@ -613,7 +617,7 @@ class Batch:
         self.out_cache_loc = self.token_to_kv_pool.alloc(bs)
 
         if self.out_cache_loc is None:
-            print("Decode out of memory. This should never happen.")
+            logger.error("Decode out of memory. This should never happen.")
             self.tree_cache.pretty_print()
             exit()
 
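Unlike `print`, `logger.error` routes these out-of-memory messages through whatever handlers and formatters the hosting process has configured, so they carry a level, a timestamp, and the logger name, and can be filtered or redirected. A rough sketch of a configuration that would surface them (assumed for illustration; this commit does not change how handlers are configured):

    import logging

    # Hypothetical setup in the launching process.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    logging.getLogger("example.batch").error(
        "Prefill out of memory. This should never happen."
    )
    # prints something like: "2024-07-30 12:00:00,000 ERROR example.batch: Prefill out of memory. ..."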
@@ -39,7 +39,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)
 
 
 class LoadBalanceMethod(Enum):
@@ -31,7 +31,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)
 
 
 class ControllerSingle:
@@ -57,7 +57,7 @@ from sglang.srt.utils import (
     monkey_patch_vllm_qvk_linear_loader,
 )
 
-logger = logging.getLogger("srt.model_runner")
+logger = logging.getLogger(__name__)
 
 
 class ModelRunner:
@@ -90,7 +90,7 @@ class ModelRunner:
 
         # Init torch distributed
         torch.cuda.set_device(self.gpu_id)
-        logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
+        logger.info(f"[gpu={self.gpu_id}] Init nccl begin.")
 
         if not server_args.enable_p2p_check:
             monkey_patch_vllm_p2p_access_check(self.gpu_id)
@@ -130,7 +130,7 @@ class ModelRunner:
 
     def load_model(self):
         logger.info(
-            f"[gpu_id={self.gpu_id}] Load weight begin. "
+            f"[gpu={self.gpu_id}] Load weight begin. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
 
@@ -178,7 +178,7 @@ class ModelRunner:
             cache_config=None,
         )
         logger.info(
-            f"[gpu_id={self.gpu_id}] Load weight end. "
+            f"[gpu={self.gpu_id}] Load weight end. "
             f"type={type(self.model).__name__}, "
             f"dtype={self.dtype}, "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
@@ -229,7 +229,7 @@ class ModelRunner:
             layer_num=self.model_config.num_hidden_layers,
         )
         logger.info(
-            f"[gpu_id={self.gpu_id}] Memory pool end. "
+            f"[gpu={self.gpu_id}] Memory pool end. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
 
@@ -280,7 +280,7 @@ class ModelRunner:
             return
 
         logger.info(
-            f"[gpu_id={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
+            f"[gpu={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
         )
         batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]
         self.cuda_graph_runner = CudaGraphRunner(
@@ -55,7 +55,7 @@ from sglang.srt.utils import (
 )
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger("srt.tp_worker")
+logger = logging.getLogger(__name__)
 
 
 class ModelTpServer:
@@ -132,7 +132,7 @@ class ModelTpServer:
 
         # Print info
         logger.info(
-            f"[gpu_id={self.gpu_id}] "
+            f"[gpu={self.gpu_id}] "
             f"max_total_num_tokens={self.max_total_num_tokens}, "
             f"max_prefill_tokens={self.max_prefill_tokens}, "
             f"max_running_requests={self.max_running_requests}, "
@@ -256,7 +256,7 @@ class ModelTpServer:
             self.num_generated_tokens = 0
             self.last_stats_tic = time.time()
             logger.info(
-                f"[gpu_id={self.gpu_id}] Decode batch. "
+                f"[gpu={self.gpu_id}] Decode batch. "
                 f"#running-req: {len(self.running_batch.reqs)}, "
                 f"#token: {num_used}, "
                 f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
@@ -434,7 +434,7 @@ class ModelTpServer:
                 self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
             )
             logger.info(
-                f"[gpu_id={self.gpu_id}] Prefill batch. "
+                f"[gpu={self.gpu_id}] Prefill batch. "
                 f"#new-seq: {len(can_run_list)}, "
                 f"#new-token: {new_batch_input_tokens}, "
                 f"#cached-token: {hit_tokens}, "
@@ -38,7 +38,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConf
 
 from sglang.srt.layers.quantization import get_quantization_config
 
-logger = logging.getLogger("srt.model_loader")
+logger = logging.getLogger(__name__)
 temp_dir = tempfile.gettempdir()
 
 
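Because every module now uses `getLogger(__name__)`, an embedding application can tune verbosity per subpackage rather than per hard-coded logger name. A hedged example (the package prefix follows the `sglang.srt` imports seen in the hunks above; adjust to the installed version):

    import logging

    logging.basicConfig(level=logging.INFO)
    # Keep only errors from the runtime internals, leave everything else at INFO.
    logging.getLogger("sglang.srt").setLevel(logging.ERROR)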