[Auto Sync] Update activation.py, chunk_cache.py, utils.py (20250917) (#10538)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
@@ -224,12 +224,13 @@ class XIELU(CustomOp):
|
|||||||
self._xielu_cuda_fn = self._xielu_cuda
|
self._xielu_cuda_fn = self._xielu_cuda
|
||||||
logger.warning_once(msg)
|
logger.warning_once(msg)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
logger.warning_once(
|
pass
|
||||||
"CUDA-fused xIELU not available (%s) –"
|
# logger.warning_once(
|
||||||
" falling back to a Python version.\n"
|
# "CUDA-fused xIELU not available (%s) –"
|
||||||
"For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`",
|
# " falling back to a Python version.\n"
|
||||||
str(err),
|
# "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`",
|
||||||
)
|
# str(err),
|
||||||
|
# )
|
||||||
|
|
||||||
def _xielu_python(self, x: torch.Tensor) -> torch.Tensor:
|
def _xielu_python(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
alpha_p = nn.functional.softplus(self.alpha_p)
|
alpha_p = nn.functional.softplus(self.alpha_p)
|
||||||
|
|||||||
@@ -28,6 +28,13 @@ class ChunkCache(BasePrefixCache):
|
|||||||
self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
|
self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
|
||||||
self.page_size = page_size
|
self.page_size = page_size
|
||||||
|
|
||||||
|
# NOTE (csy): this determines whether a cache has the prefix-matching feature.
|
||||||
|
# Chunk cache always returns True to indicate no prefix matching.
|
||||||
|
# TODO (csy): Use a prefix cache trait to replace this.
|
||||||
|
@property
|
||||||
|
def disable(self):
|
||||||
|
return True
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -38,7 +45,7 @@ class ChunkCache(BasePrefixCache):
|
|||||||
last_host_node=None,
|
last_host_node=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def cache_finished_req(self, req: Req):
|
def cache_finished_req(self, req: Req, insert: bool = True):
|
||||||
kv_indices = self.req_to_token_pool.req_to_token[
|
kv_indices = self.req_to_token_pool.req_to_token[
|
||||||
req.req_pool_idx,
|
req.req_pool_idx,
|
||||||
# For decode server: if req.output_ids is empty, we want to free all req.origin_input_ids
|
# For decode server: if req.output_ids is empty, we want to free all req.origin_input_ids
|
||||||
|
|||||||
@@ -82,11 +82,9 @@ from packaging import version as pkg_version
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
from starlette.routing import Mount
|
from starlette.routing import Mount
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.func import functional_call
|
|
||||||
from torch.library import Library
|
from torch.library import Library
|
||||||
from torch.profiler import ProfilerActivity, profile, record_function
|
from torch.profiler import ProfilerActivity, profile, record_function
|
||||||
from torch.utils._contextlib import _DecoratorContextManager
|
from torch.utils._contextlib import _DecoratorContextManager
|
||||||
from triton.runtime.cache import FileCacheManager
|
|
||||||
from typing_extensions import Literal
|
from typing_extensions import Literal
|
||||||
|
|
||||||
from sglang.srt.metrics.func_timer import enable_func_timer
|
from sglang.srt.metrics.func_timer import enable_func_timer
|
||||||
|
|||||||
Reference in New Issue
Block a user