Update docs (#1839)

2024-10-30 02:49:08 -07:00
parent 539df95d2c
commit b548801ddb
11 changed files with 165 additions and 198 deletions
--- a/python/sglang/srt/mem_cache/flush_cache.py
+++ b/python/sglang/srt/mem_cache/flush_cache.py
@@ -29,5 +29,5 @@ if __name__ == "__main__":
    parser.add_argument("--url", type=str, default="http://localhost:30000")
    args = parser.parse_args()

-    response = requests.get(args.url + "/flush_cache")
+    response = requests.post(args.url + "/flush_cache")
    assert response.status_code == 200
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -124,7 +124,7 @@ class ModelRunner:
                "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
            )
            server_args.chunked_prefill_size = None
-            server_args.mem_fraction_static *= 0.95
+            self.mem_fraction_static *= 0.95
            # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
            if self.model_config.hf_config.architectures == [
                "Qwen2VLForConditionalGeneration"
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -139,7 +139,7 @@ async def get_server_args():
    return dataclasses.asdict(tokenizer_manager.server_args)


-@app.get("/flush_cache")
+@app.post("/flush_cache")
 async def flush_cache():
    """Flush the radix cache."""
    tokenizer_manager.flush_cache()
@@ -180,7 +180,7 @@ async def get_memory_pool_size():

        return ret
    except Exception as e:
-        return JSONResponse(
+        return ORJSONResponse(
            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
        )