Clean logs for DeepSeek-V3 launching (#6079)
@@ -75,7 +75,8 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False
 
-        logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
+        if self.rank == 0:
+            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
 
         if self.rank == 0:
             # get the unique id from NCCL
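The hunk above shows the commit's basic pattern: gate a per-process startup message behind rank 0 so it is printed once per launch instead of once per GPU. Below is a minimal, self-contained sketch of that pattern using plain torch.distributed; the helper name and the single-process fallback are illustrative assumptions, not part of this diff.

import logging

import torch.distributed as dist

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def log_once_across_ranks(msg: str) -> None:
    # Hypothetical helper: emit the message only on global rank 0 when
    # torch.distributed is initialized, and unconditionally otherwise.
    if not dist.is_initialized() or dist.get_rank() == 0:
        logger.info(msg)


# Every rank may call this, but only one line reaches the log.
log_once_across_ranks("NCCL communicator initialized.")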
@@ -29,6 +29,7 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )
 
 _is_hip = is_hip()
@@ -945,7 +946,9 @@ def get_moe_configs(
             # For example, updating the Triton version might cause all old configs to become suboptimal.
             # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
             # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
-            logger.info("Using MoE kernel config from %s.", config_file_path)
+            log_info_on_rank0(
+                logger, f"Using MoE kernel config from {config_file_path}."
+            )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
 
@@ -66,6 +66,7 @@ from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
 )
@@ -104,10 +105,7 @@ class Fp8Config(QuantizationConfig):
     ) -> None:
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         if is_checkpoint_fp8_serialized:
-            logger.warning(
-                "Detected fp8 checkpoint. Please note that the "
-                "format is experimental and subject to change."
-            )
+            log_info_on_rank0(logger, "Detected fp8 checkpoint.")
         if activation_scheme not in ACTIVATION_SCHEMES:
             raise ValueError(f"Unsupported activation scheme {activation_scheme}")
         self.activation_scheme = activation_scheme
@@ -30,6 +30,7 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     supports_custom_op,
 )
 
@@ -698,9 +699,9 @@ def get_w8a8_block_fp8_configs(
     )
     if os.path.exists(config_file_path):
         with open(config_file_path) as f:
-            logger.info(
-                "Using configuration from %s for W8A8 Block FP8 kernel.",
-                config_file_path,
+            log_info_on_rank0(
+                logger,
+                f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.",
             )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
@@ -278,9 +278,10 @@ class ModelRunner:
                 server_args.attention_backend = "fa3"
             else:
                 server_args.attention_backend = "triton"
-            logger.info(
-                f"Attention backend not set. Use {server_args.attention_backend} backend by default."
-            )
+            if self.should_log:
+                logger.info(
+                    f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+                )
         elif self.use_mla_backend:
             if server_args.device != "cpu":
                 if server_args.attention_backend in [
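This and the following ModelRunner hunks wrap messages in `if self.should_log:`. The flag's definition is outside the hunks shown here; the class below is only a sketch of how such a flag is plausibly derived, stated as an assumption rather than the actual sglang code.

class RunnerLoggingSketch:
    """Illustrative only, not the real ModelRunner."""

    def __init__(self, tp_rank: int):
        # Assumption: log configuration messages from a single process per
        # model replica, typically tensor-parallel rank 0.
        self.tp_rank = tp_rank
        self.should_log = tp_rank == 0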
@@ -290,9 +291,10 @@ class ModelRunner:
                     "flashmla",
                     "cutlass_mla",
                 ]:
-                    logger.info(
-                        f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
-                    )
+                    if self.should_log:
+                        logger.info(
+                            f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
+                        )
                 else:
                     raise ValueError(
                         f"Invalid attention backend for MLA: {server_args.attention_backend}"
@@ -311,9 +313,10 @@ class ModelRunner:
            server_args.attention_backend = "triton"
 
         if server_args.enable_double_sparsity:
-            logger.info(
-                "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
-            )
+            if self.should_log:
+                logger.info(
+                    "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
+                )
             server_args.attention_backend = "triton"
             server_args.disable_cuda_graph = True
             if server_args.ds_heavy_channel_type is None:
@@ -324,23 +327,26 @@ class ModelRunner:
 
         if self.is_multimodal:
             self.mem_fraction_static *= 0.90
-            logger.info(
-                f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
-                f"because this is a multimodal model."
-            )
-            logger.info(
-                "Automatically turn off --chunked-prefill-size for multimodal model."
-            )
+            if self.should_log:
+                logger.info(
+                    f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
+                    f"because this is a multimodal model."
+                )
+                logger.info(
+                    "Automatically turn off --chunked-prefill-size for multimodal model."
+                )
             server_args.chunked_prefill_size = -1
 
         if not self.use_mla_backend:
             server_args.disable_chunked_prefix_cache = True
         elif self.page_size > 1:
-            logger.info("Disable chunked prefix cache when page size > 1.")
+            if self.should_log:
+                logger.info("Disable chunked prefix cache when page size > 1.")
             server_args.disable_chunked_prefix_cache = True
 
         if not server_args.disable_chunked_prefix_cache:
-            logger.info("Chunked prefix cache is turned on.")
+            if self.should_log:
+                logger.info("Chunked prefix cache is turned on.")
 
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
@@ -433,9 +439,10 @@ class ModelRunner:
         torch.set_num_threads(1)
         if self.device == "cuda":
             if torch.cuda.get_device_capability()[0] < 8:
-                logger.info(
-                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
-                )
+                if self.should_log:
+                    logger.info(
+                        "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
+                    )
                 self.server_args.dtype = "float16"
                 self.model_config.dtype = torch.float16
                 if torch.cuda.get_device_capability()[1] < 5:
@@ -471,10 +478,11 @@ class ModelRunner:
                 self.model.load_kv_cache_scales(
                     self.server_args.quantization_param_path
                 )
-                logger.info(
-                    "Loaded KV cache scaling factors from %s",
-                    self.server_args.quantization_param_path,
-                )
+                if self.should_log:
+                    logger.info(
+                        "Loaded KV cache scaling factors from %s",
+                        self.server_args.quantization_param_path,
+                    )
             else:
                 raise RuntimeError(
                     "Using FP8 KV cache and scaling factors provided but "
@@ -1021,7 +1029,8 @@ class ModelRunner:
         )
 
     def apply_torch_tp(self):
-        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
+        if self.should_log:
+            logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
         from sglang.srt.model_parallel import tensor_parallel
 
         device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
@@ -88,6 +88,7 @@ from sglang.srt.utils import (
     get_int_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )
 
 _is_hip = is_hip()
@@ -1485,8 +1486,9 @@ class DeepseekV2ForCausalLM(nn.Module):
         ):
             self.n_share_experts_fusion = 0
             global_server_args_dict["n_share_experts_fusion"] = 0
-            logger.info(
-                "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled."
+            log_info_on_rank0(
+                logger,
+                "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled.",
             )
         else:
             assert (
@@ -1501,8 +1503,9 @@ class DeepseekV2ForCausalLM(nn.Module):
         ):
             self.n_share_experts_fusion = self.tp_size
             global_server_args_dict["n_share_experts_fusion"] = self.tp_size
-            logger.info(
-                "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled."
+            log_info_on_rank0(
+                logger,
+                "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled.",
             )
 
     def get_input_embeddings(self) -> nn.Embedding:
@@ -2096,3 +2096,10 @@ class BumpAllocator:
         output = self._buffer[self._pointer : self._pointer + size]
         self._pointer += size
         return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
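For reference, a minimal usage sketch of the `log_info_on_rank0` helper added above, assuming sglang is installed and the tensor-parallel process group has already been initialized; the logger name and message text are illustrative, not taken from the diff.

import logging

from sglang.srt.utils import log_info_on_rank0

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("sglang.example")

# Every rank may execute this line, but only tensor-parallel rank 0
# emits the record, so a multi-GPU launch logs the message once.
log_info_on_rank0(logger, "Model weights loaded.")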