Update docs (#1839)

This commit is contained in:
Lianmin Zheng
2024-10-30 02:49:08 -07:00
committed by GitHub
parent 539df95d2c
commit b548801ddb
11 changed files with 165 additions and 198 deletions

View File

@@ -29,5 +29,5 @@ if __name__ == "__main__":
parser.add_argument("--url", type=str, default="http://localhost:30000")
args = parser.parse_args()
response = requests.get(args.url + "/flush_cache")
response = requests.post(args.url + "/flush_cache")
assert response.status_code == 200

View File

@@ -124,7 +124,7 @@ class ModelRunner:
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
)
server_args.chunked_prefill_size = None
server_args.mem_fraction_static *= 0.95
self.mem_fraction_static *= 0.95
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
if self.model_config.hf_config.architectures == [
"Qwen2VLForConditionalGeneration"

View File

@@ -139,7 +139,7 @@ async def get_server_args():
return dataclasses.asdict(tokenizer_manager.server_args)
@app.get("/flush_cache")
@app.post("/flush_cache")
async def flush_cache():
"""Flush the radix cache."""
tokenizer_manager.flush_cache()
@@ -180,7 +180,7 @@ async def get_memory_pool_size():
return ret
except Exception as e:
return JSONResponse(
return ORJSONResponse(
{"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
)

View File

@@ -19,7 +19,6 @@ from typing import Optional, Union
import numpy as np
import requests
import torch
from IPython.display import HTML, display
from tqdm import tqdm
@@ -332,14 +331,13 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
headers={"Authorization": "Bearer None"},
)
if response.status_code == 200:
time.sleep(5)
print_highlight(
"""\n
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
"""
Server and notebook outputs are combined for clarity.
Typically, the server runs in a separate terminal.
Server output is gray; notebook output is highlighted.
"""
)
break
@@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
def terminate_process(process):
"""Safely terminate a process and clean up GPU memory.
Args:
process: subprocess.Popen object to terminate
"""
try:
process.terminate()
try:
process.wait(timeout=5)
except subprocess.TimeoutExpired:
if os.name != "nt":
try:
pgid = os.getpgid(process.pid)
os.killpg(pgid, signal.SIGTERM)
time.sleep(1)
if process.poll() is None:
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
pass
else:
process.kill()
process.wait()
except Exception as e:
print(f"Warning: {e}")
finally:
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
time.sleep(2)
from sglang.srt.utils import kill_child_process
kill_child_process(process.pid, include_self=True)
def print_highlight(html_content: str):