From 73600673bb1dd87b6c31d88cef17c9c8a0cce4ad Mon Sep 17 00:00:00 2001
From: Baizhou Zhang
Date: Wed, 7 May 2025 18:54:50 -0700
Subject: [PATCH] Clean logs for DeepSeek-V3 launching (#6079)

---
 .../device_communicators/pynccl.py            |  3 +-
 .../layers/moe/fused_moe_triton/fused_moe.py  |  5 +-
 python/sglang/srt/layers/quantization/fp8.py  |  6 +-
 .../srt/layers/quantization/fp8_kernel.py     |  7 ++-
 .../sglang/srt/model_executor/model_runner.py | 61 +++++++++++--------
 python/sglang/srt/models/deepseek_v2.py       | 11 ++--
 python/sglang/srt/utils.py                    |  7 +++
 7 files changed, 61 insertions(+), 39 deletions(-)

diff --git a/python/sglang/srt/distributed/device_communicators/pynccl.py b/python/sglang/srt/distributed/device_communicators/pynccl.py
index 9f65939f6..6459f70fd 100644
--- a/python/sglang/srt/distributed/device_communicators/pynccl.py
+++ b/python/sglang/srt/distributed/device_communicators/pynccl.py
@@ -75,7 +75,8 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False
 
-        logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
+        if self.rank == 0:
+            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
 
         if self.rank == 0:
             # get the unique id from NCCL
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
index 6965b43a2..39d52cb53 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -29,6 +29,7 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )
 
 _is_hip = is_hip()
@@ -945,7 +946,9 @@ def get_moe_configs(
             # For example, updating the Triton version might cause all old configs to become suboptimal.
             # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
             # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
-            logger.info("Using MoE kernel config from %s.", config_file_path)
+            log_info_on_rank0(
+                logger, f"Using MoE kernel config from {config_file_path}."
+            )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
 
diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py
index b5fdccb88..c36dbf295 100644
--- a/python/sglang/srt/layers/quantization/fp8.py
+++ b/python/sglang/srt/layers/quantization/fp8.py
@@ -66,6 +66,7 @@ from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
 )
@@ -104,10 +105,7 @@ class Fp8Config(QuantizationConfig):
     ) -> None:
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         if is_checkpoint_fp8_serialized:
-            logger.warning(
-                "Detected fp8 checkpoint. Please note that the "
-                "format is experimental and subject to change."
-            )
+            log_info_on_rank0(logger, "Detected fp8 checkpoint.")
         if activation_scheme not in ACTIVATION_SCHEMES:
             raise ValueError(f"Unsupported activation scheme {activation_scheme}")
         self.activation_scheme = activation_scheme
diff --git a/python/sglang/srt/layers/quantization/fp8_kernel.py b/python/sglang/srt/layers/quantization/fp8_kernel.py
index e52f69142..d1a0ffa91 100644
--- a/python/sglang/srt/layers/quantization/fp8_kernel.py
+++ b/python/sglang/srt/layers/quantization/fp8_kernel.py
@@ -30,6 +30,7 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     supports_custom_op,
 )
 
@@ -698,9 +699,9 @@ def get_w8a8_block_fp8_configs(
     )
     if os.path.exists(config_file_path):
         with open(config_file_path) as f:
-            logger.info(
-                "Using configuration from %s for W8A8 Block FP8 kernel.",
-                config_file_path,
+            log_info_on_rank0(
+                logger,
+                f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.",
             )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 589cc9b06..ff08f182a 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -278,9 +278,10 @@ class ModelRunner:
                     server_args.attention_backend = "fa3"
                 else:
                     server_args.attention_backend = "triton"
-            logger.info(
-                f"Attention backend not set. Use {server_args.attention_backend} backend by default."
-            )
+            if self.should_log:
+                logger.info(
+                    f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+                )
         elif self.use_mla_backend:
             if server_args.device != "cpu":
                 if server_args.attention_backend in [
@@ -290,9 +291,10 @@ class ModelRunner:
                     "flashmla",
                     "cutlass_mla",
                 ]:
-                    logger.info(
-                        f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
-                    )
+                    if self.should_log:
+                        logger.info(
+                            f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
+                        )
                 else:
                     raise ValueError(
                         f"Invalid attention backend for MLA: {server_args.attention_backend}"
@@ -311,9 +313,10 @@ class ModelRunner:
             server_args.attention_backend = "triton"
 
         if server_args.enable_double_sparsity:
-            logger.info(
-                "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
-            )
+            if self.should_log:
+                logger.info(
+                    "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
+                )
             server_args.attention_backend = "triton"
             server_args.disable_cuda_graph = True
             if server_args.ds_heavy_channel_type is None:
@@ -324,23 +327,26 @@ class ModelRunner:
 
         if self.is_multimodal:
             self.mem_fraction_static *= 0.90
-            logger.info(
-                f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
-                f"because this is a multimodal model."
-            )
-            logger.info(
-                "Automatically turn off --chunked-prefill-size for multimodal model."
-            )
+            if self.should_log:
+                logger.info(
+                    f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
+                    f"because this is a multimodal model."
+                )
+                logger.info(
+                    "Automatically turn off --chunked-prefill-size for multimodal model."
+                )
             server_args.chunked_prefill_size = -1
 
         if not self.use_mla_backend:
             server_args.disable_chunked_prefix_cache = True
         elif self.page_size > 1:
-            logger.info("Disable chunked prefix cache when page size > 1.")
+            if self.should_log:
+                logger.info("Disable chunked prefix cache when page size > 1.")
             server_args.disable_chunked_prefix_cache = True
 
         if not server_args.disable_chunked_prefix_cache:
-            logger.info("Chunked prefix cache is turned on.")
+            if self.should_log:
+                logger.info("Chunked prefix cache is turned on.")
 
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
@@ -433,9 +439,10 @@ class ModelRunner:
         torch.set_num_threads(1)
         if self.device == "cuda":
             if torch.cuda.get_device_capability()[0] < 8:
-                logger.info(
-                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
-                )
+                if self.should_log:
+                    logger.info(
+                        "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
+                    )
                 self.server_args.dtype = "float16"
                 self.model_config.dtype = torch.float16
                 if torch.cuda.get_device_capability()[1] < 5:
@@ -471,10 +478,11 @@ class ModelRunner:
                     self.model.load_kv_cache_scales(
                         self.server_args.quantization_param_path
                     )
-                    logger.info(
-                        "Loaded KV cache scaling factors from %s",
-                        self.server_args.quantization_param_path,
-                    )
+                    if self.should_log:
+                        logger.info(
+                            "Loaded KV cache scaling factors from %s",
+                            self.server_args.quantization_param_path,
+                        )
                 else:
                     raise RuntimeError(
                         "Using FP8 KV cache and scaling factors provided but "
@@ -1021,7 +1029,8 @@ class ModelRunner:
         )
 
     def apply_torch_tp(self):
-        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
+        if self.should_log:
+            logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
         from sglang.srt.model_parallel import tensor_parallel
 
         device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index 2d5906f80..92c5057db 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -88,6 +88,7 @@ from sglang.srt.utils import (
     get_int_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )
 
 _is_hip = is_hip()
@@ -1485,8 +1486,9 @@ class DeepseekV2ForCausalLM(nn.Module):
             ):
                 self.n_share_experts_fusion = 0
                 global_server_args_dict["n_share_experts_fusion"] = 0
-                logger.info(
-                    "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled."
+                log_info_on_rank0(
+                    logger,
+                    "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled.",
                 )
             else:
                 assert (
@@ -1501,8 +1503,9 @@ class DeepseekV2ForCausalLM(nn.Module):
             ):
                 self.n_share_experts_fusion = self.tp_size
                 global_server_args_dict["n_share_experts_fusion"] = self.tp_size
-                logger.info(
-                    "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled."
+                log_info_on_rank0(
+                    logger,
+                    "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled.",
                 )
 
     def get_input_embeddings(self) -> nn.Embedding:
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index d137e4eac..2c41076f5 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -2096,3 +2096,10 @@ class BumpAllocator:
         output = self._buffer[self._pointer : self._pointer + size]
         self._pointer += size
         return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)