diff --git a/docs/README.md b/docs/README.md index 29afab90a..67c3ad194 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,4 +1,5 @@ # SGLang Documentation +This is the documentation repository for SGLang. It is auto-generated from https://github.com/sgl-project/sglang/tree/main/docs. ## Build the documentation website diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index a1e43e4cd..9662f61a8 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -407,7 +407,7 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput: def get_model(pretrained_model_name_or_path: str) -> str: - if os.getenv("SGLANG_USE_MODELSCOPE", "False").lower() == "true": + if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true": import huggingface_hub.constants from modelscope import snapshot_download diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index b72134a56..fb391e627 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -7,6 +7,7 @@ FlashInfer is faster and Triton is easier to customize. Each backend supports two operators: extend (i.e. prefill with cached prefix) and decode. """ +import os from enum import Enum, auto from typing import TYPE_CHECKING, List @@ -45,13 +46,19 @@ class FlashInferAttnBackend(AttentionBackend): super().__init__() # Parse constants - if not _grouped_size_compiled_for_decode_kernels( - model_runner.model_config.num_attention_heads // model_runner.tp_size, - model_runner.model_config.get_num_kv_heads(model_runner.tp_size), - ): - self.decode_use_tensor_cores = True + if "SGLANG_FLASHINFER_USE_TENSOR_CORE" in os.environ: + self.decode_use_tensor_cores = ( + os.environ["SGLANG_FLASHINFER_USE_TENSOR_CORE"].lower() == "true" + ) else: - self.decode_use_tensor_cores = False + if not _grouped_size_compiled_for_decode_kernels( + model_runner.model_config.num_attention_heads // model_runner.tp_size, + model_runner.model_config.get_num_kv_heads(model_runner.tp_size), + ): + self.decode_use_tensor_cores = True + else: + self.decode_use_tensor_cores = False + self.max_context_len = model_runner.model_config.context_len assert not ( diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 796164297..9ca9b7c64 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -81,7 +81,7 @@ from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) # Test retract decode -test_retract = os.getenv("SGLANG_TEST_RETRACT", "false") == "true" +test_retract = os.getenv("SGLANG_TEST_RETRACT", "false").lower() == "true" class Scheduler: diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 8bb6a5830..e947d1a92 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -930,7 +930,7 @@ def get_nvgpu_memory_capacity(): def crash_on_warnings(): # Crash on warning if we are running CI tests - return os.getenv("SGLANG_IS_IN_CI", "false") == "true" + return os.getenv("SGLANG_IS_IN_CI", "false").lower() == "true" def get_device_name(device_id: int = 0) -> str: diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index d673d59ff..be1755bd3 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -44,7 +44,7 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8 def is_in_ci(): """Return whether it is in CI runner.""" - return os.getenv("SGLANG_IS_IN_CI", "false") == "true" + return os.getenv("SGLANG_IS_IN_CI", "false").lower() == "true" if is_in_ci():