diff --git a/docs/references/environment_variables.md b/docs/references/environment_variables.md index 2e5277a3f..3f660318e 100644 --- a/docs/references/environment_variables.md +++ b/docs/references/environment_variables.md @@ -82,7 +82,7 @@ SGLang supports various environment variables that can be used to configure its | Environment Variable | Description | Default Value | | --- | --- | --- | | `SGLANG_IS_IN_CI` | Indicates if running in CI environment | `false` | -| `SGLANG_AMD_CI` | Indicates running in AMD CI environment | `0` | +| `SGLANG_IS_IN_CI_AMD` | Indicates running in AMD CI environment | `0` | | `SGLANG_TEST_RETRACT` | Enable retract decode testing | `false` | | `SGLANG_RECORD_STEP_TIME` | Record step time for profiling | `false` | | `SGLANG_TEST_REQUEST_TIME_STATS` | Test request time statistics | `false` | diff --git a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py index 6836c9bc9..bb7128206 100644 --- a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py +++ b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py @@ -18,7 +18,7 @@ from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import is_weak_contiguous, ) from sglang.srt.distributed.parallel_state import in_the_same_node_as -from sglang.srt.utils import is_cuda, is_hip +from sglang.srt.utils import is_cuda, is_hip, log_info_on_rank0 logger = logging.getLogger(__name__) @@ -301,11 +301,11 @@ class CustomAllreduce: if _is_hip: handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr) handles, offsets = self._gather_ipc_meta((bytes(handle), offset)) - logger.info("Registering %d cuda graph addresses", len(offset)) + log_info_on_rank0(logger, f"Registering {len(offset)} cuda graph addresses") ops.register_graph_buffers(self._ptr, handles, offsets) else: handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr) - logger.info("Registering %d 
cuda graph addresses", len(offset)) + log_info_on_rank0(logger, f"Registering {len(offset)} cuda graph addresses") # We cannot directly use `dist.all_gather_object` here # because it is incompatible with `gloo` backend under inference mode. # see https://github.com/pytorch/pytorch/issues/126032 for details. diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py index 06b68c523..7a31365ff 100644 --- a/python/sglang/srt/environ.py +++ b/python/sglang/srt/environ.py @@ -113,7 +113,7 @@ class Envs: # Test & Debug SGLANG_IS_IN_CI = EnvBool(False) - SGLANG_AMD_CI = EnvBool(False) + SGLANG_IS_IN_CI_AMD = EnvBool(False) SGLANG_TEST_RETRACT = EnvBool(False) SGLANG_SET_CPU_AFFINITY = EnvBool(False) SGLANG_PROFILE_WITH_STACK = EnvBool(True) @@ -197,12 +197,12 @@ class Envs: # sgl-kernel SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False) - # vLLM dependencies + # vLLM dependencies (TODO: they have been deprecated, we can remove them safely) USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False) USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False) USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False) - RETURN_ORIGINAL_LOGPROB = EnvBool(False) + SGLANG_RETURN_ORIGINAL_LOGPROB = EnvBool(False) SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False) SGLANG_MOE_PADDING = EnvBool(False) SGLANG_CUTLASS_MOE = EnvBool(False) diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index a0cf55b0e..e108866df 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -65,7 +65,7 @@ class LogitsProcessorOutput: hidden_states: Optional[torch.Tensor] = None ## Part 2: This part will be assigned in python/sglang/srt/layers/sampler.py::Sampler - # he log probs of output tokens, if RETURN_ORIGINAL_LOGPROB = True, will get the log probs before applying temperature. If False, will get the log probs before applying temperature. 
+ # The log probs of output tokens, if SGLANG_RETURN_ORIGINAL_LOGPROB = True, will get the log probs before applying temperature. If False, will get the log probs after applying temperature. next_token_logprobs: Optional[torch.Tensor] = None # The logprobs and ids of the top-k tokens in output positions. shape: [#seq, k] next_token_top_logprobs_val: Optional[List] = None diff --git a/python/sglang/srt/layers/moe/utils.py b/python/sglang/srt/layers/moe/utils.py index ac743dd86..651499363 100644 --- a/python/sglang/srt/layers/moe/utils.py +++ b/python/sglang/srt/layers/moe/utils.py @@ -13,6 +13,7 @@ from sglang.srt.layers.dp_attention import ( get_attention_dp_size, is_dp_attention_enabled, ) +from sglang.srt.utils import log_info_on_rank0 if TYPE_CHECKING: from sglang.srt.server_args import ServerArgs @@ -159,8 +160,9 @@ def get_moe_a2a_backend() -> MoeA2ABackend: def get_moe_runner_backend() -> MoeRunnerBackend: global MOE_RUNNER_BACKEND if MOE_RUNNER_BACKEND is None: - logger.warning( - "MOE_RUNNER_BACKEND is not initialized, the backend will be automatically selected" + log_info_on_rank0( + logger, + "MOE_RUNNER_BACKEND is not initialized, the backend will be automatically selected", ) MOE_RUNNER_BACKEND = MoeRunnerBackend.AUTO return MOE_RUNNER_BACKEND diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py index c87c9ac72..8ff1b8681 100644 --- a/python/sglang/srt/layers/sampler.py +++ b/python/sglang/srt/layers/sampler.py @@ -27,7 +27,7 @@ if is_cuda(): logger = logging.getLogger(__name__) SYNC_TOKEN_IDS_ACROSS_TP = get_bool_env_var("SYNC_TOKEN_IDS_ACROSS_TP") -RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB") +SGLANG_RETURN_ORIGINAL_LOGPROB = get_bool_env_var("SGLANG_RETURN_ORIGINAL_LOGPROB") class Sampler(nn.Module): @@ -99,7 +99,7 @@ class Sampler(nn.Module): ) # If requested, cache probabilities from original logits before temperature scaling.
- if return_logprob and RETURN_ORIGINAL_LOGPROB: + if return_logprob and SGLANG_RETURN_ORIGINAL_LOGPROB: probs_without_temp_scaling = torch.softmax(logits, dim=-1) # Post process logits @@ -149,7 +149,7 @@ class Sampler(nn.Module): if return_logprob: # clamp to avoid -inf - if RETURN_ORIGINAL_LOGPROB: + if SGLANG_RETURN_ORIGINAL_LOGPROB: logprobs = torch.log(probs_without_temp_scaling).clamp( min=torch.finfo(probs_without_temp_scaling.dtype).min ) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 39ee02aaf..4575d2091 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -286,8 +286,6 @@ class ModelRunner: self.forward_pass_id = 0 # Apply the rank zero filter to logger - if not any(isinstance(f, RankZeroFilter) for f in logger.filters): - logger.addFilter(RankZeroFilter(tp_rank == 0)) if server_args.show_time_cost: enable_show_time_cost() @@ -577,8 +575,9 @@ class ModelRunner: server_args.attention_backend = "ascend" else: server_args.attention_backend = "triton" - logger.info( - f"Attention backend not explicitly specified. Use {server_args.attention_backend} backend by default." + log_info_on_rank0( + logger, + f"Attention backend not explicitly specified. 
Use {server_args.attention_backend} backend by default.", ) elif self.use_mla_backend: if server_args.device != "cpu": diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py index 577d051b7..900a37074 100644 --- a/python/sglang/srt/model_loader/weight_utils.py +++ b/python/sglang/srt/model_loader/weight_utils.py @@ -38,7 +38,7 @@ from sglang.srt.distributed import get_tensor_model_parallel_rank from sglang.srt.layers.dp_attention import get_attention_tp_rank from sglang.srt.layers.quantization import QuantizationConfig, get_quantization_config from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp4Config -from sglang.srt.utils import find_local_repo_dir, print_warning_once +from sglang.srt.utils import find_local_repo_dir, log_info_on_rank0, print_warning_once from sglang.utils import is_in_ci logger = logging.getLogger(__name__) @@ -429,7 +429,7 @@ def download_weights_from_hf( allow_patterns = [pattern] break - logger.info("Using model weights format %s", allow_patterns) + log_info_on_rank0(logger, f"Using model weights format {allow_patterns}") # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. with get_lock(model_name_or_path, cache_dir): diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 8d179b2c7..dbfd48efb 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -2484,13 +2484,7 @@ class ServerArgs: default=ServerArgs.mamba_full_memory_ratio, help="The ratio of mamba state memory to full kv cache memory.", ) - # Args for multi-item-scoring - parser.add_argument( - "--multi-item-scoring-delimiter", - type=int, - default=ServerArgs.multi_item_scoring_delimiter, - help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: QueryItem1Item2... 
This enables efficient batch processing of multiple items against a single query.", - ) + # Hierarchical cache parser.add_argument( "--enable-hierarchical-cache", @@ -2636,6 +2630,14 @@ class ServerArgs: help="Mode of offloading.", ) + # Args for multi-item-scoring + parser.add_argument( + "--multi-item-scoring-delimiter", + type=int, + default=ServerArgs.multi_item_scoring_delimiter, + help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: QueryItem1Item2... This enables efficient batch processing of multiple items against a single query.", + ) + # Optimization/debug options parser.add_argument( "--disable-radix-cache", diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py index 736f65dad..ce01187c2 100644 --- a/python/sglang/srt/speculative/eagle_worker.py +++ b/python/sglang/srt/speculative/eagle_worker.py @@ -64,7 +64,7 @@ if is_cuda(): from sgl_kernel import segment_packbits logger = logging.getLogger(__name__) -RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB") +SGLANG_RETURN_ORIGINAL_LOGPROB = get_bool_env_var("SGLANG_RETURN_ORIGINAL_LOGPROB") @contextmanager @@ -741,7 +741,7 @@ class EAGLEWorker(TpModelWorker): # acceptance indices are the indices in a "flattened" batch. # dividing it to num_draft_tokens will yield the actual batch index. 
temperatures = temperatures[accepted_indices // num_draft_tokens] - if RETURN_ORIGINAL_LOGPROB: + if SGLANG_RETURN_ORIGINAL_LOGPROB: logprobs = torch.nn.functional.log_softmax( logits_output.next_token_logits, dim=-1 ) diff --git a/python/sglang/srt/speculative/spec_info.py b/python/sglang/srt/speculative/spec_info.py index 389d57ed1..47d8be80e 100644 --- a/python/sglang/srt/speculative/spec_info.py +++ b/python/sglang/srt/speculative/spec_info.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from enum import IntEnum, auto +from functools import lru_cache from typing import List, Tuple from sglang.srt.managers.schedule_batch import ModelWorkerBatch @@ -27,6 +28,7 @@ class SpeculativeAlgorithm(IntEnum): def is_ngram(self): return self == SpeculativeAlgorithm.NGRAM + @lru_cache(maxsize=None) @staticmethod def from_string(name: str): name_map = { diff --git a/python/sglang/srt/speculative/standalone_worker.py b/python/sglang/srt/speculative/standalone_worker.py index b6004ea01..ca0490e39 100644 --- a/python/sglang/srt/speculative/standalone_worker.py +++ b/python/sglang/srt/speculative/standalone_worker.py @@ -15,7 +15,7 @@ if is_cuda(): from sgl_kernel import segment_packbits logger = logging.getLogger(__name__) -RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB") +SGLANG_RETURN_ORIGINAL_LOGPROB = get_bool_env_var("SGLANG_RETURN_ORIGINAL_LOGPROB") @contextmanager diff --git a/python/sglang/test/send_one.py b/python/sglang/test/send_one.py index a0aec308d..061a4d05b 100644 --- a/python/sglang/test/send_one.py +++ b/python/sglang/test/send_one.py @@ -3,6 +3,8 @@ Run one test prompt. 
Usage: python3 -m sglang.test.send_one +python3 -m sglang.test.send_one --profile --profile-steps 5 +python3 -m sglang.test.send_one --profile --profile-by-stage """ import argparse @@ -11,6 +13,8 @@ import json import requests +from sglang.profiler import run_profile + @dataclasses.dataclass class BenchArgs: @@ -29,6 +33,9 @@ class BenchArgs: image: bool = False many_images: bool = False stream: bool = False + profile: bool = False + profile_steps: int = 3 + profile_by_stage: bool = False @staticmethod def add_cli_args(parser: argparse.ArgumentParser): @@ -51,6 +58,11 @@ class BenchArgs: parser.add_argument("--image", action="store_true") parser.add_argument("--many-images", action="store_true") parser.add_argument("--stream", action="store_true") + parser.add_argument("--profile", action="store_true") + parser.add_argument( + "--profile-steps", type=int, default=BenchArgs.profile_steps + ) + parser.add_argument("--profile-by-stage", action="store_true") @classmethod def from_cli_args(cls, args: argparse.Namespace): @@ -59,6 +71,8 @@ class BenchArgs: def send_one_prompt(args): + base_url = f"http://{args.host}:{args.port}" + if args.image: args.prompt = ( "Human: Describe this image in a very short sentence.\n\nAssistant:" @@ -108,8 +122,20 @@ def send_one_prompt(args): "stream": args.stream, } + # Run profiler if requested + if args.profile: + print(f"Running profiler with {args.profile_steps} steps...") + run_profile( + base_url, + args.profile_steps, + ["CPU", "GPU"], + None, + None, + args.profile_by_stage, + ) + response = requests.post( - f"http://{args.host}:{args.port}/generate", + f"{base_url}/generate", json=json_data, stream=args.stream, ) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index edbcdefd7..95a842684 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -126,7 +126,7 @@ def is_in_ci(): def is_in_amd_ci(): """Return whether it is in an AMD CI runner.""" - return 
get_bool_env_var("SGLANG_AMD_CI") + return get_bool_env_var("SGLANG_IS_IN_CI_AMD") def _use_cached_default_models(model_repo: str): diff --git a/scripts/ci/amd_ci_exec.sh b/scripts/ci/amd_ci_exec.sh index 3bd940eb1..6cfaa35bc 100755 --- a/scripts/ci/amd_ci_exec.sh +++ b/scripts/ci/amd_ci_exec.sh @@ -15,7 +15,7 @@ fi WORKDIR="/sglang-checkout/test/srt" declare -A ENV_MAP=( - [SGLANG_AMD_CI]=1 + [SGLANG_IS_IN_CI_AMD]=1 [SGLANG_IS_IN_CI]=1 [SGLANG_USE_AITER]=1 ) diff --git a/scripts/ci/ci_install_dependency.sh b/scripts/ci/ci_install_dependency.sh index 8aba6318e..517b2ca45 100755 --- a/scripts/ci/ci_install_dependency.sh +++ b/scripts/ci/ci_install_dependency.sh @@ -13,6 +13,7 @@ echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}" # Clear torch compilation cache python3 -c 'import os, shutil, tempfile, getpass; cache_dir = os.environ.get("TORCHINDUCTOR_CACHE_DIR") or os.path.join(tempfile.gettempdir(), "torchinductor_" + getpass.getuser()); shutil.rmtree(cache_dir, ignore_errors=True)' +rm -rf /root/.cache/flashinfer # Install apt packages apt install -y git libnuma-dev diff --git a/test/srt/test_original_logprobs.py b/test/srt/test_original_logprobs.py index ddcfe3d8e..1a50e3aa6 100644 --- a/test/srt/test_original_logprobs.py +++ b/test/srt/test_original_logprobs.py @@ -125,8 +125,8 @@ class TestOriginalLogprob(unittest.TestCase): vocab_size = self.tokenizer.vocab_size for env_val in ["True", "False"]: - with self.subTest(return_original_logprob=env_val): - os.environ["RETURN_ORIGINAL_LOGPROB"] = env_val + with self.subTest(SGLANG_RETURN_ORIGINAL_LOGPROB=env_val): + os.environ["SGLANG_RETURN_ORIGINAL_LOGPROB"] = env_val # ----- SGLang side ----- sgl_engine = sgl.Engine(