Update docs (#1839)
This commit is contained in:
@@ -29,5 +29,5 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--url", type=str, default="http://localhost:30000")
|
||||
args = parser.parse_args()
|
||||
|
||||
response = requests.get(args.url + "/flush_cache")
|
||||
response = requests.post(args.url + "/flush_cache")
|
||||
assert response.status_code == 200
|
||||
|
||||
@@ -124,7 +124,7 @@ class ModelRunner:
|
||||
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
|
||||
)
|
||||
server_args.chunked_prefill_size = None
|
||||
server_args.mem_fraction_static *= 0.95
|
||||
self.mem_fraction_static *= 0.95
|
||||
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
|
||||
if self.model_config.hf_config.architectures == [
|
||||
"Qwen2VLForConditionalGeneration"
|
||||
|
||||
@@ -139,7 +139,7 @@ async def get_server_args():
|
||||
return dataclasses.asdict(tokenizer_manager.server_args)
|
||||
|
||||
|
||||
@app.get("/flush_cache")
|
||||
@app.post("/flush_cache")
|
||||
async def flush_cache():
|
||||
"""Flush the radix cache."""
|
||||
tokenizer_manager.flush_cache()
|
||||
@@ -180,7 +180,7 @@ async def get_memory_pool_size():
|
||||
|
||||
return ret
|
||||
except Exception as e:
|
||||
return JSONResponse(
|
||||
return ORJSONResponse(
|
||||
{"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
|
||||
)
|
||||
|
||||
|
||||
@@ -19,7 +19,6 @@ from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
import torch
|
||||
from IPython.display import HTML, display
|
||||
from tqdm import tqdm
|
||||
|
||||
@@ -332,14 +331,13 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
|
||||
headers={"Authorization": "Bearer None"},
|
||||
)
|
||||
if response.status_code == 200:
|
||||
time.sleep(5)
|
||||
print_highlight(
|
||||
"""\n
|
||||
NOTE: Typically, the server runs in a separate terminal.
|
||||
In this notebook, we run the server and notebook code together, so their outputs are combined.
|
||||
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
|
||||
"""
|
||||
Server and notebook outputs are combined for clarity.
|
||||
|
||||
Typically, the server runs in a separate terminal.
|
||||
|
||||
Server output is gray; notebook output is highlighted.
|
||||
"""
|
||||
)
|
||||
break
|
||||
|
||||
@@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
|
||||
|
||||
|
||||
def terminate_process(process):
|
||||
"""Safely terminate a process and clean up GPU memory.
|
||||
|
||||
Args:
|
||||
process: subprocess.Popen object to terminate
|
||||
"""
|
||||
try:
|
||||
process.terminate()
|
||||
try:
|
||||
process.wait(timeout=5)
|
||||
except subprocess.TimeoutExpired:
|
||||
if os.name != "nt":
|
||||
try:
|
||||
pgid = os.getpgid(process.pid)
|
||||
os.killpg(pgid, signal.SIGTERM)
|
||||
time.sleep(1)
|
||||
if process.poll() is None:
|
||||
os.killpg(pgid, signal.SIGKILL)
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
else:
|
||||
process.kill()
|
||||
process.wait()
|
||||
except Exception as e:
|
||||
print(f"Warning: {e}")
|
||||
finally:
|
||||
gc.collect()
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.ipc_collect()
|
||||
time.sleep(2)
|
||||
from sglang.srt.utils import kill_child_process
|
||||
kill_child_process(process.pid, include_self=True)
|
||||
|
||||
|
||||
def print_highlight(html_content: str):
|
||||
|
||||
Reference in New Issue
Block a user