Fix GPU OOM (#6564)

Co-authored-by: michael <michael.zhang@amd.com>
2025-05-25 07:38:39 +08:00
parent 24c035f2e3
commit 7a5e6ce1cb
8 changed files with 15 additions and 16 deletions
--- a/python/sglang/srt/layers/attention/aiter_backend.py
+++ b/python/sglang/srt/layers/attention/aiter_backend.py
@@ -506,6 +506,7 @@ class AiterIndicesUpdaterPrefill:
                spec_info.generate_attn_arg_prefill(
                    req_pool_indices,
                    paged_kernel_lens,
+                    None,
                    self.req_to_token,
                )
            )
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -412,6 +412,10 @@ class ModelRunner:
        if not server_args.disable_chunked_prefix_cache:
            logger.info("Chunked prefix cache is turned on.")

+        if server_args.attention_backend == "aiter":
+            if self.model_config.context_len > 8192:
+                self.mem_fraction_static *= 0.85
+
    def init_torch_distributed(self):
        logger.info("Init torch distributed begin.")