Clean logs for DeepSeek-V3 launching (#6079)
This commit is contained in:
@@ -75,7 +75,8 @@ class PyNcclCommunicator:
|
|||||||
self.available = True
|
self.available = True
|
||||||
self.disabled = False
|
self.disabled = False
|
||||||
|
|
||||||
logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
|
if self.rank == 0:
|
||||||
|
logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
|
||||||
|
|
||||||
if self.rank == 0:
|
if self.rank == 0:
|
||||||
# get the unique id from NCCL
|
# get the unique id from NCCL
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ from sglang.srt.utils import (
|
|||||||
get_device_name,
|
get_device_name,
|
||||||
is_cuda,
|
is_cuda,
|
||||||
is_hip,
|
is_hip,
|
||||||
|
log_info_on_rank0,
|
||||||
)
|
)
|
||||||
|
|
||||||
_is_hip = is_hip()
|
_is_hip = is_hip()
|
||||||
@@ -945,7 +946,9 @@ def get_moe_configs(
|
|||||||
# For example, updating the Triton version might cause all old configs to become suboptimal.
|
# For example, updating the Triton version might cause all old configs to become suboptimal.
|
||||||
# To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
|
# To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
|
||||||
# For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
|
# For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
|
||||||
logger.info("Using MoE kernel config from %s.", config_file_path)
|
log_info_on_rank0(
|
||||||
|
logger, f"Using MoE kernel config from {config_file_path}."
|
||||||
|
)
|
||||||
# If a configuration has been found, return it
|
# If a configuration has been found, return it
|
||||||
return {int(key): val for key, val in json.load(f).items()}
|
return {int(key): val for key, val in json.load(f).items()}
|
||||||
|
|
||||||
|
|||||||
@@ -66,6 +66,7 @@ from sglang.srt.utils import (
|
|||||||
get_bool_env_var,
|
get_bool_env_var,
|
||||||
is_cuda,
|
is_cuda,
|
||||||
is_hip,
|
is_hip,
|
||||||
|
log_info_on_rank0,
|
||||||
print_warning_once,
|
print_warning_once,
|
||||||
set_weight_attrs,
|
set_weight_attrs,
|
||||||
)
|
)
|
||||||
@@ -104,10 +105,7 @@ class Fp8Config(QuantizationConfig):
|
|||||||
) -> None:
|
) -> None:
|
||||||
self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
|
self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
|
||||||
if is_checkpoint_fp8_serialized:
|
if is_checkpoint_fp8_serialized:
|
||||||
logger.warning(
|
log_info_on_rank0(logger, "Detected fp8 checkpoint.")
|
||||||
"Detected fp8 checkpoint. Please note that the "
|
|
||||||
"format is experimental and subject to change."
|
|
||||||
)
|
|
||||||
if activation_scheme not in ACTIVATION_SCHEMES:
|
if activation_scheme not in ACTIVATION_SCHEMES:
|
||||||
raise ValueError(f"Unsupported activation scheme {activation_scheme}")
|
raise ValueError(f"Unsupported activation scheme {activation_scheme}")
|
||||||
self.activation_scheme = activation_scheme
|
self.activation_scheme = activation_scheme
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ from sglang.srt.utils import (
|
|||||||
get_device_name,
|
get_device_name,
|
||||||
is_cuda,
|
is_cuda,
|
||||||
is_hip,
|
is_hip,
|
||||||
|
log_info_on_rank0,
|
||||||
supports_custom_op,
|
supports_custom_op,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -698,9 +699,9 @@ def get_w8a8_block_fp8_configs(
|
|||||||
)
|
)
|
||||||
if os.path.exists(config_file_path):
|
if os.path.exists(config_file_path):
|
||||||
with open(config_file_path) as f:
|
with open(config_file_path) as f:
|
||||||
logger.info(
|
log_info_on_rank0(
|
||||||
"Using configuration from %s for W8A8 Block FP8 kernel.",
|
logger,
|
||||||
config_file_path,
|
f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.",
|
||||||
)
|
)
|
||||||
# If a configuration has been found, return it
|
# If a configuration has been found, return it
|
||||||
return {int(key): val for key, val in json.load(f).items()}
|
return {int(key): val for key, val in json.load(f).items()}
|
||||||
|
|||||||
@@ -278,9 +278,10 @@ class ModelRunner:
|
|||||||
server_args.attention_backend = "fa3"
|
server_args.attention_backend = "fa3"
|
||||||
else:
|
else:
|
||||||
server_args.attention_backend = "triton"
|
server_args.attention_backend = "triton"
|
||||||
logger.info(
|
if self.should_log:
|
||||||
f"Attention backend not set. Use {server_args.attention_backend} backend by default."
|
logger.info(
|
||||||
)
|
f"Attention backend not set. Use {server_args.attention_backend} backend by default."
|
||||||
|
)
|
||||||
elif self.use_mla_backend:
|
elif self.use_mla_backend:
|
||||||
if server_args.device != "cpu":
|
if server_args.device != "cpu":
|
||||||
if server_args.attention_backend in [
|
if server_args.attention_backend in [
|
||||||
@@ -290,9 +291,10 @@ class ModelRunner:
|
|||||||
"flashmla",
|
"flashmla",
|
||||||
"cutlass_mla",
|
"cutlass_mla",
|
||||||
]:
|
]:
|
||||||
logger.info(
|
if self.should_log:
|
||||||
f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
|
logger.info(
|
||||||
)
|
f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Invalid attention backend for MLA: {server_args.attention_backend}"
|
f"Invalid attention backend for MLA: {server_args.attention_backend}"
|
||||||
@@ -311,9 +313,10 @@ class ModelRunner:
|
|||||||
server_args.attention_backend = "triton"
|
server_args.attention_backend = "triton"
|
||||||
|
|
||||||
if server_args.enable_double_sparsity:
|
if server_args.enable_double_sparsity:
|
||||||
logger.info(
|
if self.should_log:
|
||||||
"Double sparsity optimization is turned on. Use triton backend without CUDA graph."
|
logger.info(
|
||||||
)
|
"Double sparsity optimization is turned on. Use triton backend without CUDA graph."
|
||||||
|
)
|
||||||
server_args.attention_backend = "triton"
|
server_args.attention_backend = "triton"
|
||||||
server_args.disable_cuda_graph = True
|
server_args.disable_cuda_graph = True
|
||||||
if server_args.ds_heavy_channel_type is None:
|
if server_args.ds_heavy_channel_type is None:
|
||||||
@@ -324,23 +327,26 @@ class ModelRunner:
|
|||||||
|
|
||||||
if self.is_multimodal:
|
if self.is_multimodal:
|
||||||
self.mem_fraction_static *= 0.90
|
self.mem_fraction_static *= 0.90
|
||||||
logger.info(
|
if self.should_log:
|
||||||
f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
|
logger.info(
|
||||||
f"because this is a multimodal model."
|
f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
|
||||||
)
|
f"because this is a multimodal model."
|
||||||
logger.info(
|
)
|
||||||
"Automatically turn off --chunked-prefill-size for multimodal model."
|
logger.info(
|
||||||
)
|
"Automatically turn off --chunked-prefill-size for multimodal model."
|
||||||
|
)
|
||||||
server_args.chunked_prefill_size = -1
|
server_args.chunked_prefill_size = -1
|
||||||
|
|
||||||
if not self.use_mla_backend:
|
if not self.use_mla_backend:
|
||||||
server_args.disable_chunked_prefix_cache = True
|
server_args.disable_chunked_prefix_cache = True
|
||||||
elif self.page_size > 1:
|
elif self.page_size > 1:
|
||||||
logger.info("Disable chunked prefix cache when page size > 1.")
|
if self.should_log:
|
||||||
|
logger.info("Disable chunked prefix cache when page size > 1.")
|
||||||
server_args.disable_chunked_prefix_cache = True
|
server_args.disable_chunked_prefix_cache = True
|
||||||
|
|
||||||
if not server_args.disable_chunked_prefix_cache:
|
if not server_args.disable_chunked_prefix_cache:
|
||||||
logger.info("Chunked prefix cache is turned on.")
|
if self.should_log:
|
||||||
|
logger.info("Chunked prefix cache is turned on.")
|
||||||
|
|
||||||
def init_torch_distributed(self):
|
def init_torch_distributed(self):
|
||||||
logger.info("Init torch distributed begin.")
|
logger.info("Init torch distributed begin.")
|
||||||
@@ -433,9 +439,10 @@ class ModelRunner:
|
|||||||
torch.set_num_threads(1)
|
torch.set_num_threads(1)
|
||||||
if self.device == "cuda":
|
if self.device == "cuda":
|
||||||
if torch.cuda.get_device_capability()[0] < 8:
|
if torch.cuda.get_device_capability()[0] < 8:
|
||||||
logger.info(
|
if self.should_log:
|
||||||
"Compute capability below sm80. Use float16 due to lack of bfloat16 support."
|
logger.info(
|
||||||
)
|
"Compute capability below sm80. Use float16 due to lack of bfloat16 support."
|
||||||
|
)
|
||||||
self.server_args.dtype = "float16"
|
self.server_args.dtype = "float16"
|
||||||
self.model_config.dtype = torch.float16
|
self.model_config.dtype = torch.float16
|
||||||
if torch.cuda.get_device_capability()[1] < 5:
|
if torch.cuda.get_device_capability()[1] < 5:
|
||||||
@@ -471,10 +478,11 @@ class ModelRunner:
|
|||||||
self.model.load_kv_cache_scales(
|
self.model.load_kv_cache_scales(
|
||||||
self.server_args.quantization_param_path
|
self.server_args.quantization_param_path
|
||||||
)
|
)
|
||||||
logger.info(
|
if self.should_log:
|
||||||
"Loaded KV cache scaling factors from %s",
|
logger.info(
|
||||||
self.server_args.quantization_param_path,
|
"Loaded KV cache scaling factors from %s",
|
||||||
)
|
self.server_args.quantization_param_path,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Using FP8 KV cache and scaling factors provided but "
|
"Using FP8 KV cache and scaling factors provided but "
|
||||||
@@ -1021,7 +1029,8 @@ class ModelRunner:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def apply_torch_tp(self):
|
def apply_torch_tp(self):
|
||||||
logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
|
if self.should_log:
|
||||||
|
logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
|
||||||
from sglang.srt.model_parallel import tensor_parallel
|
from sglang.srt.model_parallel import tensor_parallel
|
||||||
|
|
||||||
device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
|
device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
|
||||||
|
|||||||
@@ -88,6 +88,7 @@ from sglang.srt.utils import (
|
|||||||
get_int_env_var,
|
get_int_env_var,
|
||||||
is_cuda,
|
is_cuda,
|
||||||
is_hip,
|
is_hip,
|
||||||
|
log_info_on_rank0,
|
||||||
)
|
)
|
||||||
|
|
||||||
_is_hip = is_hip()
|
_is_hip = is_hip()
|
||||||
@@ -1485,8 +1486,9 @@ class DeepseekV2ForCausalLM(nn.Module):
|
|||||||
):
|
):
|
||||||
self.n_share_experts_fusion = 0
|
self.n_share_experts_fusion = 0
|
||||||
global_server_args_dict["n_share_experts_fusion"] = 0
|
global_server_args_dict["n_share_experts_fusion"] = 0
|
||||||
logger.info(
|
log_info_on_rank0(
|
||||||
"Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled."
|
logger,
|
||||||
|
"Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled.",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
assert (
|
assert (
|
||||||
@@ -1501,8 +1503,9 @@ class DeepseekV2ForCausalLM(nn.Module):
|
|||||||
):
|
):
|
||||||
self.n_share_experts_fusion = self.tp_size
|
self.n_share_experts_fusion = self.tp_size
|
||||||
global_server_args_dict["n_share_experts_fusion"] = self.tp_size
|
global_server_args_dict["n_share_experts_fusion"] = self.tp_size
|
||||||
logger.info(
|
log_info_on_rank0(
|
||||||
"Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled."
|
logger,
|
||||||
|
"Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled.",
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_input_embeddings(self) -> nn.Embedding:
|
def get_input_embeddings(self) -> nn.Embedding:
|
||||||
|
|||||||
@@ -2096,3 +2096,10 @@ class BumpAllocator:
|
|||||||
output = self._buffer[self._pointer : self._pointer + size]
|
output = self._buffer[self._pointer : self._pointer + size]
|
||||||
self._pointer += size
|
self._pointer += size
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def log_info_on_rank0(logger, msg):
    """Log ``msg`` at INFO level, but only on tensor-parallel rank 0.

    Args:
        logger: Object exposing an ``info(msg)`` method; the message is
            forwarded to it unchanged.
        msg: The already-formatted message to emit.

    On any rank other than 0 (as reported by
    ``get_tensor_model_parallel_rank()``) the call returns without logging.
    """
    # The import is performed at call time, inside the function body.
    from sglang.srt.distributed import get_tensor_model_parallel_rank

    # Guard clause: every non-zero rank stays silent.
    if get_tensor_model_parallel_rank() != 0:
        return
    logger.info(msg)
|
||||||
|
|||||||
Reference in New Issue
Block a user