Update docs (#1839)

2024-10-30 02:49:08 -07:00
parent 539df95d2c
commit b548801ddb
11 changed files with 165 additions and 198 deletions
--- a/python/sglang/srt/mem_cache/flush_cache.py
+++ b/python/sglang/srt/mem_cache/flush_cache.py
@@ -29,5 +29,5 @@ if __name__ == "__main__":
    parser.add_argument("--url", type=str, default="http://localhost:30000")
    args = parser.parse_args()

-    response = requests.get(args.url + "/flush_cache")
+    response = requests.post(args.url + "/flush_cache")
    assert response.status_code == 200
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -124,7 +124,7 @@ class ModelRunner:
                "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
            )
            server_args.chunked_prefill_size = None
-            server_args.mem_fraction_static *= 0.95
+            self.mem_fraction_static *= 0.95
            # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
            if self.model_config.hf_config.architectures == [
                "Qwen2VLForConditionalGeneration"
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -139,7 +139,7 @@ async def get_server_args():
    return dataclasses.asdict(tokenizer_manager.server_args)


-@app.get("/flush_cache")
+@app.post("/flush_cache")
 async def flush_cache():
    """Flush the radix cache."""
    tokenizer_manager.flush_cache()
@@ -180,7 +180,7 @@ async def get_memory_pool_size():

        return ret
    except Exception as e:
-        return JSONResponse(
+        return ORJSONResponse(
            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
        )

--- a/python/sglang/utils.py
+++ b/python/sglang/utils.py
@@ -19,7 +19,6 @@ from typing import Optional, Union

 import numpy as np
 import requests
-import torch
 from IPython.display import HTML, display
 from tqdm import tqdm

@@ -332,14 +331,13 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
                headers={"Authorization": "Bearer None"},
            )
            if response.status_code == 200:
+                time.sleep(5)
                print_highlight(
+                    """\n
+                    NOTE: Typically, the server runs in a separate terminal.
+                    In this notebook, we run the server and notebook code together, so their outputs are combined.
+                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
                    """
-                            Server and notebook outputs are combined for clarity.
-                            
-                            Typically, the server runs in a separate terminal.
-                            
-                            Server output is gray; notebook output is highlighted.
-                            """
                )
                break

@@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:


 def terminate_process(process):
-    """Safely terminate a process and clean up GPU memory.
-
-    Args:
-        process: subprocess.Popen object to terminate
-    """
-    try:
-        process.terminate()
-        try:
-            process.wait(timeout=5)
-        except subprocess.TimeoutExpired:
-            if os.name != "nt":
-                try:
-                    pgid = os.getpgid(process.pid)
-                    os.killpg(pgid, signal.SIGTERM)
-                    time.sleep(1)
-                    if process.poll() is None:
-                        os.killpg(pgid, signal.SIGKILL)
-                except ProcessLookupError:
-                    pass
-            else:
-                process.kill()
-            process.wait()
-    except Exception as e:
-        print(f"Warning: {e}")
-    finally:
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.ipc_collect()
-        time.sleep(2)
+    from sglang.srt.utils import kill_child_process
+    kill_child_process(process.pid, include_self=True)


 def print_highlight(html_content: str):