From f949ad5794c17d34fe1fb90d34143764e42cc86a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 16 Sep 2025 17:06:43 -0700 Subject: [PATCH] [Auto Sync] Update activation.py, chunk_cache.py, utils.py (20250917) (#10538) Co-authored-by: github-actions[bot] --- python/sglang/srt/layers/activation.py | 13 +++++++------ python/sglang/srt/mem_cache/chunk_cache.py | 9 ++++++++- python/sglang/srt/utils.py | 2 -- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py index 67bae9b52..5dc48821a 100644 --- a/python/sglang/srt/layers/activation.py +++ b/python/sglang/srt/layers/activation.py @@ -224,12 +224,13 @@ class XIELU(CustomOp): self._xielu_cuda_fn = self._xielu_cuda logger.warning_once(msg) except Exception as err: - logger.warning_once( - "CUDA-fused xIELU not available (%s) –" - " falling back to a Python version.\n" - "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`", - str(err), - ) + pass + # logger.warning_once( + # "CUDA-fused xIELU not available (%s) –" + # " falling back to a Python version.\n" + # "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`", + # str(err), + # ) def _xielu_python(self, x: torch.Tensor) -> torch.Tensor: alpha_p = nn.functional.softplus(self.alpha_p) diff --git a/python/sglang/srt/mem_cache/chunk_cache.py b/python/sglang/srt/mem_cache/chunk_cache.py index 1a576bfa2..6ca8d9995 100644 --- a/python/sglang/srt/mem_cache/chunk_cache.py +++ b/python/sglang/srt/mem_cache/chunk_cache.py @@ -28,6 +28,13 @@ class ChunkCache(BasePrefixCache): self.token_to_kv_pool_allocator = token_to_kv_pool_allocator self.page_size = page_size + # NOTE (csy): this is to determine if a cache has prefix matching feature. + # Chunk cache always returns True to indicate no prefix matching.
+ # TODO (csy): Using a prefix cache trait to replace this + @property + def disable(self): + return True + def reset(self): pass @@ -38,7 +45,7 @@ class ChunkCache(BasePrefixCache): last_host_node=None, ) - def cache_finished_req(self, req: Req): + def cache_finished_req(self, req: Req, insert: bool = True): kv_indices = self.req_to_token_pool.req_to_token[ req.req_pool_idx, # For decode server: if req.output_ids is empty, we want to free all req.origin_input_ids diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index e38400e3f..1c9de7b7b 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -82,11 +82,9 @@ from packaging import version as pkg_version from PIL import Image from starlette.routing import Mount from torch import nn -from torch.func import functional_call from torch.library import Library from torch.profiler import ProfilerActivity, profile, record_function from torch.utils._contextlib import _DecoratorContextManager -from triton.runtime.cache import FileCacheManager from typing_extensions import Literal from sglang.srt.metrics.func_timer import enable_func_timer