diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 43cd18118..265339ec4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,6 +30,6 @@ repos: rev: 24.10.0 hooks: - id: black - additional_dependencies: ['.[jupyter]'] - types: [python, jupyter] - types_or: [python, jupyter] + types: [python] + - id: black-jupyter + types: [jupyter] diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 04cee0776..7a58c00a5 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -34,10 +34,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:08.536886Z", - "iopub.status.busy": "2024-11-05T05:08:08.536763Z", - "iopub.status.idle": "2024-11-05T05:08:34.725831Z", - "shell.execute_reply": "2024-11-05T05:08:34.725316Z" + "iopub.execute_input": "2024-11-07T18:44:42.063503Z", + "iopub.status.busy": "2024-11-07T18:44:42.063379Z", + "iopub.status.idle": "2024-11-07T18:45:07.255300Z", + "shell.execute_reply": "2024-11-07T18:45:07.254547Z" } }, "outputs": [], @@ -73,10 +73,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:34.727530Z", - "iopub.status.busy": "2024-11-05T05:08:34.727333Z", - "iopub.status.idle": "2024-11-05T05:08:35.359784Z", - "shell.execute_reply": "2024-11-05T05:08:35.359090Z" + "iopub.execute_input": "2024-11-07T18:45:07.258292Z", + "iopub.status.busy": "2024-11-07T18:45:07.257710Z", + "iopub.status.idle": "2024-11-07T18:45:07.611559Z", + "shell.execute_reply": "2024-11-07T18:45:07.610842Z" } }, "outputs": [], @@ -101,10 +101,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.362286Z", - "iopub.status.busy": "2024-11-05T05:08:35.362140Z", - "iopub.status.idle": "2024-11-05T05:08:35.368711Z", - "shell.execute_reply": "2024-11-05T05:08:35.368220Z" + "iopub.execute_input": "2024-11-07T18:45:07.613911Z", + "iopub.status.busy": "2024-11-07T18:45:07.613746Z", + "iopub.status.idle": "2024-11-07T18:45:07.620286Z", + "shell.execute_reply": "2024-11-07T18:45:07.619779Z" } }, "outputs": [], @@ -132,10 +132,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.371313Z", - "iopub.status.busy": "2024-11-05T05:08:35.370877Z", - "iopub.status.idle": "2024-11-05T05:08:35.376712Z", - "shell.execute_reply": "2024-11-05T05:08:35.376230Z" + "iopub.execute_input": "2024-11-07T18:45:07.622407Z", + "iopub.status.busy": "2024-11-07T18:45:07.622267Z", + "iopub.status.idle": "2024-11-07T18:45:07.628290Z", + "shell.execute_reply": "2024-11-07T18:45:07.627793Z" } }, "outputs": [], @@ -164,10 +164,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.378982Z", - "iopub.status.busy": "2024-11-05T05:08:35.378597Z", - "iopub.status.idle": "2024-11-05T05:08:35.391820Z", - "shell.execute_reply": "2024-11-05T05:08:35.391336Z" + "iopub.execute_input": "2024-11-07T18:45:07.630585Z", + "iopub.status.busy": "2024-11-07T18:45:07.630235Z", + "iopub.status.idle": "2024-11-07T18:45:07.643498Z", + "shell.execute_reply": "2024-11-07T18:45:07.643007Z" } }, "outputs": [], @@ -183,10 +183,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.393748Z", - "iopub.status.busy": "2024-11-05T05:08:35.393606Z", - "iopub.status.idle": "2024-11-05T05:08:35.398645Z", - "shell.execute_reply": "2024-11-05T05:08:35.398145Z" + "iopub.execute_input": 
"2024-11-07T18:45:07.645336Z", + "iopub.status.busy": "2024-11-07T18:45:07.645196Z", + "iopub.status.idle": "2024-11-07T18:45:07.650363Z", + "shell.execute_reply": "2024-11-07T18:45:07.649837Z" } }, "outputs": [], @@ -211,10 +211,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.400683Z", - "iopub.status.busy": "2024-11-05T05:08:35.400419Z", - "iopub.status.idle": "2024-11-05T05:08:35.406146Z", - "shell.execute_reply": "2024-11-05T05:08:35.405661Z" + "iopub.execute_input": "2024-11-07T18:45:07.652212Z", + "iopub.status.busy": "2024-11-07T18:45:07.652076Z", + "iopub.status.idle": "2024-11-07T18:45:07.658633Z", + "shell.execute_reply": "2024-11-07T18:45:07.658119Z" } }, "outputs": [], @@ -241,10 +241,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.408176Z", - "iopub.status.busy": "2024-11-05T05:08:35.407884Z", - "iopub.status.idle": "2024-11-05T05:08:35.413587Z", - "shell.execute_reply": "2024-11-05T05:08:35.413108Z" + "iopub.execute_input": "2024-11-07T18:45:07.660468Z", + "iopub.status.busy": "2024-11-07T18:45:07.660325Z", + "iopub.status.idle": "2024-11-07T18:45:07.666476Z", + "shell.execute_reply": "2024-11-07T18:45:07.665984Z" } }, "outputs": [], @@ -271,10 +271,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:35.416090Z", - "iopub.status.busy": "2024-11-05T05:08:35.415793Z", - "iopub.status.idle": "2024-11-05T05:08:36.552549Z", - "shell.execute_reply": "2024-11-05T05:08:36.551870Z" + "iopub.execute_input": "2024-11-07T18:45:07.668242Z", + "iopub.status.busy": "2024-11-07T18:45:07.668108Z", + "iopub.status.idle": "2024-11-07T18:45:08.725709Z", + "shell.execute_reply": "2024-11-07T18:45:08.725021Z" } }, "outputs": [], @@ -296,10 +296,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:36.554823Z", - "iopub.status.busy": "2024-11-05T05:08:36.554680Z", - "iopub.status.idle": "2024-11-05T05:08:38.053945Z", - "shell.execute_reply": "2024-11-05T05:08:38.053034Z" + "iopub.execute_input": "2024-11-07T18:45:08.727865Z", + "iopub.status.busy": "2024-11-07T18:45:08.727721Z", + "iopub.status.idle": "2024-11-07T18:45:11.165841Z", + "shell.execute_reply": "2024-11-07T18:45:11.165282Z" } }, "outputs": [], @@ -335,10 +335,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:08:38.056783Z", - "iopub.status.busy": "2024-11-05T05:08:38.056497Z", - "iopub.status.idle": "2024-11-05T05:09:04.436030Z", - "shell.execute_reply": "2024-11-05T05:09:04.435311Z" + "iopub.execute_input": "2024-11-07T18:45:11.167853Z", + "iopub.status.busy": "2024-11-07T18:45:11.167711Z", + "iopub.status.idle": "2024-11-07T18:45:39.542988Z", + "shell.execute_reply": "2024-11-07T18:45:39.542135Z" } }, "outputs": [], @@ -360,10 +360,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:04.438987Z", - "iopub.status.busy": "2024-11-05T05:09:04.438568Z", - "iopub.status.idle": "2024-11-05T05:09:04.485291Z", - "shell.execute_reply": "2024-11-05T05:09:04.484829Z" + "iopub.execute_input": "2024-11-07T18:45:39.545416Z", + "iopub.status.busy": "2024-11-07T18:45:39.545005Z", + "iopub.status.idle": "2024-11-07T18:45:39.588793Z", + "shell.execute_reply": "2024-11-07T18:45:39.588054Z" } }, "outputs": [], @@ -392,10 +392,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:04.487191Z", - 
"iopub.status.busy": "2024-11-05T05:09:04.486929Z", - "iopub.status.idle": "2024-11-05T05:09:25.553481Z", - "shell.execute_reply": "2024-11-05T05:09:25.552747Z" + "iopub.execute_input": "2024-11-07T18:45:39.590729Z", + "iopub.status.busy": "2024-11-07T18:45:39.590446Z", + "iopub.status.idle": "2024-11-07T18:45:59.660376Z", + "shell.execute_reply": "2024-11-07T18:45:59.659992Z" } }, "outputs": [], @@ -419,10 +419,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:25.555813Z", - "iopub.status.busy": "2024-11-05T05:09:25.555666Z", - "iopub.status.idle": "2024-11-05T05:09:26.354372Z", - "shell.execute_reply": "2024-11-05T05:09:26.353693Z" + "iopub.execute_input": "2024-11-07T18:45:59.661779Z", + "iopub.status.busy": "2024-11-07T18:45:59.661641Z", + "iopub.status.idle": "2024-11-07T18:46:00.475726Z", + "shell.execute_reply": "2024-11-07T18:46:00.475269Z" } }, "outputs": [], @@ -445,10 +445,7 @@ "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n", "\n", "url = \"http://localhost:30030/classify\"\n", - "data = {\n", - " \"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \n", - " \"text\": prompts\n", - "}\n", + "data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n", "\n", "responses = requests.post(url, json=data).json()\n", "for response in responses:\n", @@ -460,10 +457,10 @@ "execution_count": 15, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:26.356532Z", - "iopub.status.busy": "2024-11-05T05:09:26.356327Z", - "iopub.status.idle": "2024-11-05T05:09:26.396590Z", - "shell.execute_reply": "2024-11-05T05:09:26.395914Z" + "iopub.execute_input": "2024-11-07T18:46:00.477283Z", + "iopub.status.busy": "2024-11-07T18:46:00.477025Z", + "iopub.status.idle": "2024-11-07T18:46:00.525758Z", + "shell.execute_reply": "2024-11-07T18:46:00.525236Z" } }, "outputs": [], diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 48f280004..f2c507640 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -35,10 +35,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:21:27.503026Z", - "iopub.status.busy": "2024-11-05T05:21:27.502741Z", - "iopub.status.idle": "2024-11-05T05:21:49.554631Z", - "shell.execute_reply": "2024-11-05T05:21:49.553690Z" + "iopub.execute_input": "2024-11-07T18:46:04.789536Z", + "iopub.status.busy": "2024-11-07T18:46:04.789418Z", + "iopub.status.idle": "2024-11-07T18:46:27.038169Z", + "shell.execute_reply": "2024-11-07T18:46:27.037540Z" } }, "outputs": [], @@ -64,10 +64,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:21:49.558275Z", - "iopub.status.busy": "2024-11-05T05:21:49.558110Z", - "iopub.status.idle": "2024-11-05T05:21:52.717287Z", - "shell.execute_reply": "2024-11-05T05:21:52.716842Z" + "iopub.execute_input": "2024-11-07T18:46:27.040005Z", + "iopub.status.busy": "2024-11-07T18:46:27.039872Z", + "iopub.status.idle": "2024-11-07T18:46:30.203840Z", + "shell.execute_reply": "2024-11-07T18:46:30.203368Z" } }, "outputs": [], @@ -99,10 +99,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:21:52.721738Z", - "iopub.status.busy": "2024-11-05T05:21:52.720908Z", - "iopub.status.idle": "2024-11-05T05:22:01.770341Z", - "shell.execute_reply": "2024-11-05T05:22:01.769510Z" + "iopub.execute_input": "2024-11-07T18:46:30.205880Z", + 
"iopub.status.busy": "2024-11-07T18:46:30.205719Z", + "iopub.status.idle": "2024-11-07T18:46:39.256561Z", + "shell.execute_reply": "2024-11-07T18:46:39.255880Z" } }, "outputs": [], @@ -137,10 +137,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:01.772662Z", - "iopub.status.busy": "2024-11-05T05:22:01.772377Z", - "iopub.status.idle": "2024-11-05T05:22:04.897499Z", - "shell.execute_reply": "2024-11-05T05:22:04.896867Z" + "iopub.execute_input": "2024-11-07T18:46:39.259464Z", + "iopub.status.busy": "2024-11-07T18:46:39.259309Z", + "iopub.status.idle": "2024-11-07T18:46:42.384955Z", + "shell.execute_reply": "2024-11-07T18:46:42.384378Z" } }, "outputs": [], @@ -179,10 +179,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:04.899754Z", - "iopub.status.busy": "2024-11-05T05:22:04.899478Z", - "iopub.status.idle": "2024-11-05T05:22:13.970245Z", - "shell.execute_reply": "2024-11-05T05:22:13.969779Z" + "iopub.execute_input": "2024-11-07T18:46:42.387431Z", + "iopub.status.busy": "2024-11-07T18:46:42.387279Z", + "iopub.status.idle": "2024-11-07T18:46:51.448572Z", + "shell.execute_reply": "2024-11-07T18:46:51.447781Z" } }, "outputs": [], @@ -216,10 +216,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:13.972039Z", - "iopub.status.busy": "2024-11-05T05:22:13.971846Z", - "iopub.status.idle": "2024-11-05T05:22:14.027421Z", - "shell.execute_reply": "2024-11-05T05:22:14.027003Z" + "iopub.execute_input": "2024-11-07T18:46:51.451177Z", + "iopub.status.busy": "2024-11-07T18:46:51.450952Z", + "iopub.status.idle": "2024-11-07T18:46:51.497530Z", + "shell.execute_reply": "2024-11-07T18:46:51.496850Z" } }, "outputs": [], diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 1dfa53129..776af13f8 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -39,10 +39,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:30.637832Z", - "iopub.status.busy": "2024-11-05T05:09:30.637709Z", - "iopub.status.idle": "2024-11-05T05:09:58.830158Z", - "shell.execute_reply": "2024-11-05T05:09:58.829395Z" + "iopub.execute_input": "2024-11-07T18:46:54.813876Z", + "iopub.status.busy": "2024-11-07T18:46:54.813741Z", + "iopub.status.idle": "2024-11-07T18:47:24.015527Z", + "shell.execute_reply": "2024-11-07T18:47:24.014987Z" } }, "outputs": [], @@ -79,10 +79,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:09:58.833008Z", - "iopub.status.busy": "2024-11-05T05:09:58.832805Z", - "iopub.status.idle": "2024-11-05T05:10:00.187146Z", - "shell.execute_reply": "2024-11-05T05:10:00.186657Z" + "iopub.execute_input": "2024-11-07T18:47:24.018153Z", + "iopub.status.busy": "2024-11-07T18:47:24.017755Z", + "iopub.status.idle": "2024-11-07T18:47:25.374821Z", + "shell.execute_reply": "2024-11-07T18:47:25.374397Z" } }, "outputs": [], @@ -119,10 +119,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:00.189444Z", - "iopub.status.busy": "2024-11-05T05:10:00.189289Z", - "iopub.status.idle": "2024-11-05T05:10:03.291891Z", - "shell.execute_reply": "2024-11-05T05:10:03.291173Z" + "iopub.execute_input": "2024-11-07T18:47:25.376617Z", + "iopub.status.busy": "2024-11-07T18:47:25.376495Z", + "iopub.status.idle": "2024-11-07T18:47:28.482537Z", + 
"shell.execute_reply": "2024-11-07T18:47:28.482125Z" } }, "outputs": [], @@ -165,10 +165,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:03.294389Z", - "iopub.status.busy": "2024-11-05T05:10:03.294237Z", - "iopub.status.idle": "2024-11-05T05:10:03.469357Z", - "shell.execute_reply": "2024-11-05T05:10:03.468661Z" + "iopub.execute_input": "2024-11-07T18:47:28.484819Z", + "iopub.status.busy": "2024-11-07T18:47:28.484673Z", + "iopub.status.idle": "2024-11-07T18:47:28.659814Z", + "shell.execute_reply": "2024-11-07T18:47:28.659435Z" } }, "outputs": [], @@ -198,10 +198,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:03.471573Z", - "iopub.status.busy": "2024-11-05T05:10:03.471430Z", - "iopub.status.idle": "2024-11-05T05:10:04.977081Z", - "shell.execute_reply": "2024-11-05T05:10:04.976391Z" + "iopub.execute_input": "2024-11-07T18:47:28.661844Z", + "iopub.status.busy": "2024-11-07T18:47:28.661710Z", + "iopub.status.idle": "2024-11-07T18:47:30.168922Z", + "shell.execute_reply": "2024-11-07T18:47:30.168600Z" } }, "outputs": [], @@ -234,10 +234,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:04.979428Z", - "iopub.status.busy": "2024-11-05T05:10:04.979272Z", - "iopub.status.idle": "2024-11-05T05:10:08.568761Z", - "shell.execute_reply": "2024-11-05T05:10:08.568355Z" + "iopub.execute_input": "2024-11-07T18:47:30.171319Z", + "iopub.status.busy": "2024-11-07T18:47:30.171176Z", + "iopub.status.idle": "2024-11-07T18:47:33.760113Z", + "shell.execute_reply": "2024-11-07T18:47:33.759713Z" } }, "outputs": [], @@ -273,10 +273,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:08.571102Z", - "iopub.status.busy": "2024-11-05T05:10:08.570964Z", - "iopub.status.idle": "2024-11-05T05:10:23.214087Z", - "shell.execute_reply": "2024-11-05T05:10:23.213664Z" + "iopub.execute_input": "2024-11-07T18:47:33.762729Z", + "iopub.status.busy": "2024-11-07T18:47:33.762590Z", + "iopub.status.idle": "2024-11-07T18:47:34.255316Z", + "shell.execute_reply": "2024-11-07T18:47:34.254907Z" } }, "outputs": [], @@ -297,7 +297,10 @@ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " messages=[\n", - " {\"role\": \"user\", \"content\": \"Give me the information of the capital of France in the JSON format.\"},\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Give me the information of the capital of France in the JSON format.\",\n", + " },\n", " ],\n", " temperature=0,\n", " max_tokens=128,\n", @@ -322,10 +325,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:23.216229Z", - "iopub.status.busy": "2024-11-05T05:10:23.216076Z", - "iopub.status.idle": "2024-11-05T05:10:23.884236Z", - "shell.execute_reply": "2024-11-05T05:10:23.883897Z" + "iopub.execute_input": "2024-11-07T18:47:34.257393Z", + "iopub.status.busy": "2024-11-07T18:47:34.257246Z", + "iopub.status.idle": "2024-11-07T18:47:34.413506Z", + "shell.execute_reply": "2024-11-07T18:47:34.413172Z" } }, "outputs": [], @@ -365,10 +368,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:23.886276Z", - "iopub.status.busy": "2024-11-05T05:10:23.886136Z", - "iopub.status.idle": "2024-11-05T05:10:23.905880Z", - "shell.execute_reply": "2024-11-05T05:10:23.905529Z" + "iopub.execute_input": "2024-11-07T18:47:34.414816Z", + 
"iopub.status.busy": "2024-11-07T18:47:34.414541Z", + "iopub.status.idle": "2024-11-07T18:47:34.431341Z", + "shell.execute_reply": "2024-11-07T18:47:34.431081Z" } }, "outputs": [], @@ -427,10 +430,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:23.907468Z", - "iopub.status.busy": "2024-11-05T05:10:23.907247Z", - "iopub.status.idle": "2024-11-05T05:10:26.920212Z", - "shell.execute_reply": "2024-11-05T05:10:26.919865Z" + "iopub.execute_input": "2024-11-07T18:47:34.432325Z", + "iopub.status.busy": "2024-11-07T18:47:34.432208Z", + "iopub.status.idle": "2024-11-07T18:47:37.444337Z", + "shell.execute_reply": "2024-11-07T18:47:37.444000Z" } }, "outputs": [], @@ -482,10 +485,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:26.922675Z", - "iopub.status.busy": "2024-11-05T05:10:26.922413Z", - "iopub.status.idle": "2024-11-05T05:10:51.961703Z", - "shell.execute_reply": "2024-11-05T05:10:51.960846Z" + "iopub.execute_input": "2024-11-07T18:47:37.445894Z", + "iopub.status.busy": "2024-11-07T18:47:37.445744Z", + "iopub.status.idle": "2024-11-07T18:48:02.482532Z", + "shell.execute_reply": "2024-11-07T18:48:02.482042Z" } }, "outputs": [], @@ -565,10 +568,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:10:51.964749Z", - "iopub.status.busy": "2024-11-05T05:10:51.964215Z", - "iopub.status.idle": "2024-11-05T05:11:05.023450Z", - "shell.execute_reply": "2024-11-05T05:11:05.023101Z" + "iopub.execute_input": "2024-11-07T18:48:02.485206Z", + "iopub.status.busy": "2024-11-07T18:48:02.485064Z", + "iopub.status.idle": "2024-11-07T18:48:15.521489Z", + "shell.execute_reply": "2024-11-07T18:48:15.521156Z" } }, "outputs": [], @@ -660,10 +663,10 @@ "execution_count": 13, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:05.024877Z", - "iopub.status.busy": "2024-11-05T05:11:05.024561Z", - "iopub.status.idle": "2024-11-05T05:11:06.358695Z", - "shell.execute_reply": "2024-11-05T05:11:06.357635Z" + "iopub.execute_input": "2024-11-07T18:48:15.522794Z", + "iopub.status.busy": "2024-11-07T18:48:15.522657Z", + "iopub.status.idle": "2024-11-07T18:48:16.875740Z", + "shell.execute_reply": "2024-11-07T18:48:16.874847Z" } }, "outputs": [], diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index a221c16eb..078024f01 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -35,10 +35,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:17.227174Z", - "iopub.status.busy": "2024-11-05T05:22:17.226952Z", - "iopub.status.idle": "2024-11-05T05:22:42.445791Z", - "shell.execute_reply": "2024-11-05T05:22:42.444980Z" + "iopub.execute_input": "2024-11-07T18:48:21.128020Z", + "iopub.status.busy": "2024-11-07T18:48:21.127898Z", + "iopub.status.idle": "2024-11-07T18:48:45.310371Z", + "shell.execute_reply": "2024-11-07T18:48:45.309469Z" } }, "outputs": [], @@ -72,10 +72,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:42.448147Z", - "iopub.status.busy": "2024-11-05T05:22:42.447775Z", - "iopub.status.idle": "2024-11-05T05:22:42.495311Z", - "shell.execute_reply": "2024-11-05T05:22:42.495027Z" + "iopub.execute_input": "2024-11-07T18:48:45.313506Z", + "iopub.status.busy": "2024-11-07T18:48:45.313123Z", + "iopub.status.idle": "2024-11-07T18:48:45.364918Z", + "shell.execute_reply": 
"2024-11-07T18:48:45.364155Z" } }, "outputs": [], @@ -106,10 +106,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:42.496666Z", - "iopub.status.busy": "2024-11-05T05:22:42.496524Z", - "iopub.status.idle": "2024-11-05T05:22:42.540687Z", - "shell.execute_reply": "2024-11-05T05:22:42.540060Z" + "iopub.execute_input": "2024-11-07T18:48:45.367776Z", + "iopub.status.busy": "2024-11-07T18:48:45.367490Z", + "iopub.status.idle": "2024-11-07T18:48:45.411386Z", + "shell.execute_reply": "2024-11-07T18:48:45.411134Z" } }, "outputs": [], @@ -140,10 +140,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:42.542551Z", - "iopub.status.busy": "2024-11-05T05:22:42.542282Z", - "iopub.status.idle": "2024-11-05T05:22:42.928542Z", - "shell.execute_reply": "2024-11-05T05:22:42.928181Z" + "iopub.execute_input": "2024-11-07T18:48:45.412462Z", + "iopub.status.busy": "2024-11-07T18:48:45.412351Z", + "iopub.status.idle": "2024-11-07T18:48:45.768796Z", + "shell.execute_reply": "2024-11-07T18:48:45.768406Z" } }, "outputs": [], @@ -176,10 +176,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:42.930093Z", - "iopub.status.busy": "2024-11-05T05:22:42.929954Z", - "iopub.status.idle": "2024-11-05T05:22:44.799945Z", - "shell.execute_reply": "2024-11-05T05:22:44.799562Z" + "iopub.execute_input": "2024-11-07T18:48:45.770227Z", + "iopub.status.busy": "2024-11-07T18:48:45.770106Z", + "iopub.status.idle": "2024-11-07T18:48:47.447065Z", + "shell.execute_reply": "2024-11-07T18:48:47.446733Z" } }, "outputs": [], @@ -208,10 +208,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:44.801418Z", - "iopub.status.busy": "2024-11-05T05:22:44.801192Z", - "iopub.status.idle": "2024-11-05T05:22:45.094634Z", - "shell.execute_reply": "2024-11-05T05:22:45.093950Z" + "iopub.execute_input": "2024-11-07T18:48:47.448510Z", + "iopub.status.busy": "2024-11-07T18:48:47.448337Z", + "iopub.status.idle": "2024-11-07T18:48:47.743336Z", + "shell.execute_reply": "2024-11-07T18:48:47.742276Z" } }, "outputs": [], diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index cbbba8c12..ef0fd40e3 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -39,10 +39,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:22:49.320999Z", - "iopub.status.busy": "2024-11-05T05:22:49.320880Z", - "iopub.status.idle": "2024-11-05T05:23:21.537478Z", - "shell.execute_reply": "2024-11-05T05:23:21.536956Z" + "iopub.execute_input": "2024-11-07T18:43:47.311708Z", + "iopub.status.busy": "2024-11-07T18:43:47.311517Z", + "iopub.status.idle": "2024-11-07T18:44:18.512576Z", + "shell.execute_reply": "2024-11-07T18:44:18.511909Z" } }, "outputs": [], @@ -78,10 +78,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:23:21.539953Z", - "iopub.status.busy": "2024-11-05T05:23:21.539100Z", - "iopub.status.idle": "2024-11-05T05:23:25.880179Z", - "shell.execute_reply": "2024-11-05T05:23:25.879744Z" + "iopub.execute_input": "2024-11-07T18:44:18.515678Z", + "iopub.status.busy": "2024-11-07T18:44:18.515314Z", + "iopub.status.idle": "2024-11-07T18:44:22.880793Z", + "shell.execute_reply": "2024-11-07T18:44:22.880303Z" } }, "outputs": [], @@ -129,10 +129,10 @@ "execution_count": null, "metadata": { "execution": { - 
"iopub.execute_input": "2024-11-05T05:23:25.881742Z", - "iopub.status.busy": "2024-11-05T05:23:25.881595Z", - "iopub.status.idle": "2024-11-05T05:23:26.758503Z", - "shell.execute_reply": "2024-11-05T05:23:26.758084Z" + "iopub.execute_input": "2024-11-07T18:44:22.883309Z", + "iopub.status.busy": "2024-11-07T18:44:22.883160Z", + "iopub.status.idle": "2024-11-07T18:44:27.048810Z", + "shell.execute_reply": "2024-11-07T18:44:27.048074Z" } }, "outputs": [], @@ -176,10 +176,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:23:26.760098Z", - "iopub.status.busy": "2024-11-05T05:23:26.759955Z", - "iopub.status.idle": "2024-11-05T05:23:27.849510Z", - "shell.execute_reply": "2024-11-05T05:23:27.849117Z" + "iopub.execute_input": "2024-11-07T18:44:27.051312Z", + "iopub.status.busy": "2024-11-07T18:44:27.051190Z", + "iopub.status.idle": "2024-11-07T18:44:32.358097Z", + "shell.execute_reply": "2024-11-07T18:44:32.357628Z" } }, "outputs": [], @@ -227,10 +227,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:23:27.850994Z", - "iopub.status.busy": "2024-11-05T05:23:27.850864Z", - "iopub.status.idle": "2024-11-05T05:23:31.609137Z", - "shell.execute_reply": "2024-11-05T05:23:31.608748Z" + "iopub.execute_input": "2024-11-07T18:44:32.359532Z", + "iopub.status.busy": "2024-11-07T18:44:32.359413Z", + "iopub.status.idle": "2024-11-07T18:44:36.164664Z", + "shell.execute_reply": "2024-11-07T18:44:36.164005Z" } }, "outputs": [], @@ -276,10 +276,10 @@ "execution_count": 6, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:23:31.610683Z", - "iopub.status.busy": "2024-11-05T05:23:31.610560Z", - "iopub.status.idle": "2024-11-05T05:23:32.965146Z", - "shell.execute_reply": "2024-11-05T05:23:32.963922Z" + "iopub.execute_input": "2024-11-07T18:44:36.167123Z", + "iopub.status.busy": "2024-11-07T18:44:36.166535Z", + "iopub.status.idle": "2024-11-07T18:44:37.743761Z", + "shell.execute_reply": "2024-11-07T18:44:37.742510Z" } }, "outputs": [], diff --git a/docs/conf.py b/docs/conf.py index e0656bb65..00153f98f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -31,7 +31,7 @@ extensions = [ ] nbsphinx_allow_errors = True -nbsphinx_execute = 'never' +nbsphinx_execute = "never" autosectionlabel_prefix_document = True nbsphinx_allow_directives = True @@ -49,7 +49,7 @@ myst_enable_extensions = [ myst_heading_anchors = 3 -nbsphinx_kernel_name = 'python3' +nbsphinx_kernel_name = "python3" nbsphinx_execute_arguments = [ "--InlineBackend.figure_formats={'svg', 'pdf'}", "--InlineBackend.rc={'figure.dpi': 96}", @@ -130,8 +130,10 @@ html_context = { html_static_path = ["_static"] html_css_files = ["css/custom_log.css"] + def setup(app): - app.add_css_file('css/custom_log.css') + app.add_css_file("css/custom_log.css") + myst_enable_extensions = [ "dollarmath", diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index ed1ea6139..684b1c8d8 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -33,10 +33,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:10.680191Z", - "iopub.status.busy": "2024-11-05T05:11:10.679710Z", - "iopub.status.idle": "2024-11-05T05:11:39.882385Z", - "shell.execute_reply": "2024-11-05T05:11:39.881827Z" + "iopub.execute_input": "2024-11-07T18:48:52.032229Z", + "iopub.status.busy": "2024-11-07T18:48:52.032105Z", + "iopub.status.idle": "2024-11-07T18:49:20.226042Z", + "shell.execute_reply": 
"2024-11-07T18:49:20.225562Z" } }, "outputs": [], @@ -49,7 +49,7 @@ ")\n", "\n", "server_process = execute_shell_command(\n", - "\"\"\"\n", + " \"\"\"\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", "--port 30000 --host 0.0.0.0\n", "\"\"\"\n", @@ -70,10 +70,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:39.883923Z", - "iopub.status.busy": "2024-11-05T05:11:39.883721Z", - "iopub.status.idle": "2024-11-05T05:11:40.124980Z", - "shell.execute_reply": "2024-11-05T05:11:40.124557Z" + "iopub.execute_input": "2024-11-07T18:49:20.228006Z", + "iopub.status.busy": "2024-11-07T18:49:20.227572Z", + "iopub.status.idle": "2024-11-07T18:49:20.469885Z", + "shell.execute_reply": "2024-11-07T18:49:20.469518Z" } }, "outputs": [], @@ -101,10 +101,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:40.126564Z", - "iopub.status.busy": "2024-11-05T05:11:40.126369Z", - "iopub.status.idle": "2024-11-05T05:11:40.324316Z", - "shell.execute_reply": "2024-11-05T05:11:40.323693Z" + "iopub.execute_input": "2024-11-07T18:49:20.471956Z", + "iopub.status.busy": "2024-11-07T18:49:20.471811Z", + "iopub.status.idle": "2024-11-07T18:49:20.667997Z", + "shell.execute_reply": "2024-11-07T18:49:20.667630Z" } }, "outputs": [], @@ -115,9 +115,7 @@ "\n", "data = {\n", " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", - " \"messages\": [\n", - " {\"role\": \"user\", \"content\": \"What is the capital of France?\"}\n", - " ]\n", + " \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n", "}\n", "\n", "response = requests.post(url, json=data)\n", @@ -136,10 +134,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:40.327043Z", - "iopub.status.busy": "2024-11-05T05:11:40.326759Z", - "iopub.status.idle": "2024-11-05T05:11:41.687336Z", - "shell.execute_reply": "2024-11-05T05:11:41.686855Z" + "iopub.execute_input": "2024-11-07T18:49:20.669977Z", + "iopub.status.busy": "2024-11-07T18:49:20.669826Z", + "iopub.status.idle": "2024-11-07T18:49:22.004855Z", + "shell.execute_reply": "2024-11-07T18:49:22.004472Z" } }, "outputs": [], @@ -171,10 +169,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:41.688676Z", - "iopub.status.busy": "2024-11-05T05:11:41.688527Z", - "iopub.status.idle": "2024-11-05T05:11:42.717140Z", - "shell.execute_reply": "2024-11-05T05:11:42.716452Z" + "iopub.execute_input": "2024-11-07T18:49:22.006983Z", + "iopub.status.busy": "2024-11-07T18:49:22.006858Z", + "iopub.status.idle": "2024-11-07T18:49:23.029098Z", + "shell.execute_reply": "2024-11-07T18:49:23.028697Z" } }, "outputs": [], @@ -197,7 +195,7 @@ "# Handle the streaming output\n", "for chunk in response:\n", " if chunk.choices[0].delta.content:\n", - " print(chunk.choices[0].delta.content, end='', flush=True)" + " print(chunk.choices[0].delta.content, end=\"\", flush=True)" ] }, { @@ -214,10 +212,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:42.720467Z", - "iopub.status.busy": "2024-11-05T05:11:42.720182Z", - "iopub.status.idle": "2024-11-05T05:11:43.480765Z", - "shell.execute_reply": "2024-11-05T05:11:43.480143Z" + "iopub.execute_input": "2024-11-07T18:49:23.031712Z", + "iopub.status.busy": "2024-11-07T18:49:23.031571Z", + "iopub.status.idle": "2024-11-07T18:49:23.787752Z", + "shell.execute_reply": 
"2024-11-07T18:49:23.787368Z" } }, "outputs": [], @@ -250,10 +248,10 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:43.483575Z", - "iopub.status.busy": "2024-11-05T05:11:43.483295Z", - "iopub.status.idle": "2024-11-05T05:11:44.242950Z", - "shell.execute_reply": "2024-11-05T05:11:44.242248Z" + "iopub.execute_input": "2024-11-07T18:49:23.789840Z", + "iopub.status.busy": "2024-11-07T18:49:23.789702Z", + "iopub.status.idle": "2024-11-07T18:49:24.545631Z", + "shell.execute_reply": "2024-11-07T18:49:24.545241Z" } }, "outputs": [], @@ -290,10 +288,10 @@ "execution_count": 8, "metadata": { "execution": { - "iopub.execute_input": "2024-11-05T05:11:44.245660Z", - "iopub.status.busy": "2024-11-05T05:11:44.245373Z", - "iopub.status.idle": "2024-11-05T05:11:45.591682Z", - "shell.execute_reply": "2024-11-05T05:11:45.591184Z" + "iopub.execute_input": "2024-11-07T18:49:24.547641Z", + "iopub.status.busy": "2024-11-07T18:49:24.547497Z", + "iopub.status.idle": "2024-11-07T18:49:25.888864Z", + "shell.execute_reply": "2024-11-07T18:49:25.888114Z" } }, "outputs": [], diff --git a/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb b/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb index 25b91b7d1..83576d3d0 100644 --- a/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb +++ b/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb @@ -71,7 +71,7 @@ "source": [ "import json\n", "import os\n", - "from typing import List\n", + "from typing import List\n", "\n", "import chromadb\n", "\n", @@ -80,7 +80,7 @@ "if not os.path.exists(path_qca):\n", " !wget https://virattt.github.io/datasets/abnb-2023-10k.json -O airbnb-2023-10k-qca.json\n", "\n", - "with open(path_qca, 'r') as f:\n", + "with open(path_qca, \"r\") as f:\n", " question_context_answers = json.load(f)\n", "\n", "chroma_client = chromadb.PersistentClient()\n", @@ -88,7 +88,7 @@ "if collection.count() == 0:\n", " collection.add(\n", " documents=[qca[\"context\"] for qca in question_context_answers],\n", - " ids=[str(i) for i in range(len(question_context_answers))]\n", + " ids=[str(i) for i in range(len(question_context_answers))],\n", " )" ], "metadata": { @@ -123,7 +123,7 @@ "\n", "load_dotenv()\n", "\n", - "os.environ['TOKENIZERS_PARALLELISM'] = \"false\"\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", "\n", "p = Parea(api_key=os.getenv(\"PAREA_API_KEY\"), project_name=\"rag_sglang\")\n", "p.integrate_with_sglang()\n", @@ -150,10 +150,7 @@ "source": [ "@trace\n", "def retrieval(question: str) -> List[str]:\n", - " return collection.query(\n", - " query_texts=[question],\n", - " n_results=1\n", - " )['documents'][0]" + " return collection.query(query_texts=[question], n_results=1)[\"documents\"][0]" ], "metadata": { "collapsed": false @@ -176,7 +173,9 @@ "@function\n", "def generation_sglang(s, question: str, *context: str):\n", " context = \"\\n\".join(context)\n", - " s += user(f'Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.')\n", + " s += user(\n", + " f\"Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.\"\n", + " )\n", " s += assistant(gen(\"answer\"))\n", "\n", "\n", @@ -223,7 +222,9 @@ " return generation(question, *contexts)\n", "\n", "\n", - "rag_pipeline(\"When did the World Health Organization formally declare an end to the COVID-19 global 
health emergency?\")" + "rag_pipeline(\n", + " \"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\"\n", + ")" ] }, { @@ -271,7 +272,10 @@ "execution_count": null, "outputs": [], "source": [ - "from parea.evals.rag import context_query_relevancy_factory, percent_target_supported_by_context_factory\n", + "from parea.evals.rag import (\n", + " context_query_relevancy_factory,\n", + " percent_target_supported_by_context_factory,\n", + ")\n", "\n", "\n", "context_relevancy_eval = context_query_relevancy_factory()\n", @@ -280,10 +284,7 @@ "\n", "@trace(eval_funcs=[context_relevancy_eval, percent_target_supported_by_context])\n", "def retrieval(question: str) -> List[str]:\n", - " return collection.query(\n", - " query_texts=[question],\n", - " n_results=1\n", - " )['documents'][0]" + " return collection.query(query_texts=[question], n_results=1)[\"documents\"][0]" ], "metadata": { "collapsed": false @@ -310,10 +311,13 @@ "answer_context_faithfulness = answer_context_faithfulness_statement_level_factory()\n", "answer_matches_target_llm_grader = answer_matches_target_llm_grader_factory()\n", "\n", + "\n", "@function\n", "def generation_sglang(s, question: str, *context: str):\n", " context = \"\\n\".join(context)\n", - " s += user(f'Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.')\n", + " s += user(\n", + " f\"Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.\"\n", + " )\n", " s += assistant(gen(\"answer\", max_tokens=1_000))\n", "\n", "\n", @@ -357,7 +361,9 @@ " return generation(question, *contexts)\n", "\n", "\n", - "rag_pipeline(\"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\")" + "rag_pipeline(\n", + " \"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\"\n", + ")" ], "metadata": { "collapsed": false @@ -402,6 +408,7 @@ "source": [ "!pip install nest-asyncio\n", "import nest_asyncio\n", + "\n", "nest_asyncio.apply()" ], "metadata": { @@ -461,7 +468,7 @@ ], "source": [ "e = p.experiment(\n", - " 'RAG',\n", + " \"RAG\",\n", " data=[\n", " {\n", " \"question\": qca[\"question\"],\n", @@ -469,7 +476,7 @@ " }\n", " for qca in question_context_answers\n", " ],\n", - " func=rag_pipeline\n", + " func=rag_pipeline,\n", ").run()" ], "metadata": { diff --git a/examples/runtime/engine/input_ids.py b/examples/runtime/engine/input_ids.py index fd7eb7e22..168796a81 100644 --- a/examples/runtime/engine/input_ids.py +++ b/examples/runtime/engine/input_ids.py @@ -7,6 +7,7 @@ from sglang.srt.hf_transformers_utils import get_tokenizer MODEL_PATH = "meta-llama/Llama-3.1-8B-Instruct" + def main(): # Sample prompts. 
prompts = [ diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py index c37cfefbd..2ce6d7459 100644 --- a/python/sglang/srt/configs/model_config.py +++ b/python/sglang/srt/configs/model_config.py @@ -39,7 +39,7 @@ class ModelConfig: revision: Optional[str] = None, context_length: Optional[int] = None, model_override_args: Optional[dict] = None, - is_embedding: Optional[bool] = None + is_embedding: Optional[bool] = None, ) -> None: # Parse args self.model_override_args = json.loads(model_override_args) @@ -52,7 +52,9 @@ class ModelConfig: self.hf_text_config = get_hf_text_config(self.hf_config) # Check model type - self.is_generation = is_generation_model(self.hf_config.architectures, is_embedding) + self.is_generation = is_generation_model( + self.hf_config.architectures, is_embedding + ) self.is_multimodal = is_multimodal_model(self.hf_config.architectures) self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures) diff --git a/python/sglang/srt/layers/quantization/base_config.py b/python/sglang/srt/layers/quantization/base_config.py index f4f5d2b47..e45dda2cc 100644 --- a/python/sglang/srt/layers/quantization/base_config.py +++ b/python/sglang/srt/layers/quantization/base_config.py @@ -122,16 +122,14 @@ class QuantizationConfig(ABC): """ raise NotImplementedError -def method_has_implemented_embedding( - method_class: Type[QuantizeMethodBase]) -> bool: + +def method_has_implemented_embedding(method_class: Type[QuantizeMethodBase]) -> bool: """ Not all quant methods have embedding implemented, so we need to check that it exists for our given method. We check this by making sure the function has been changed from the base implementation. """ - base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", - None) + base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", None) class_embedding = inspect.getattr_static(method_class, "embedding", None) - return (class_embedding is not None - and class_embedding is not base_embedding) + return class_embedding is not None and class_embedding is not base_embedding diff --git a/python/sglang/srt/layers/vocab_parallel_embedding.py b/python/sglang/srt/layers/vocab_parallel_embedding.py index c1e758b02..a2d15fc78 100644 --- a/python/sglang/srt/layers/vocab_parallel_embedding.py +++ b/python/sglang/srt/layers/vocab_parallel_embedding.py @@ -27,59 +27,67 @@ DEFAULT_VOCAB_PADDING_SIZE = 64 class UnquantizedEmbeddingMethod(QuantizeMethodBase): """Unquantized method for embeddings.""" - def create_weights(self, layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): """Create weights for embedding layer.""" - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=params_dtype), - requires_grad=False) + weight = Parameter( + torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) layer.register_parameter("weight", weight) set_weight_attrs(weight, extra_weight_attrs) - def apply(self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: 
Optional[torch.Tensor] = None) -> torch.Tensor: + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: return F.linear(x, layer.weight, bias) - def embedding(self, layer: torch.nn.Module, - input_: torch.Tensor) -> torch.Tensor: + def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor: return F.embedding(input_, layer.weight) -def pad_vocab_size(vocab_size: int, - pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int: +def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int: """Pad the vocab size to the given value.""" return ((vocab_size + pad_to - 1) // pad_to) * pad_to def vocab_range_from_per_partition_vocab_size( - per_partition_vocab_size: int, - rank: int, - offset: int = 0) -> Sequence[int]: + per_partition_vocab_size: int, rank: int, offset: int = 0 +) -> Sequence[int]: index_f = rank * per_partition_vocab_size index_l = index_f + per_partition_vocab_size return index_f + offset, index_l + offset -def vocab_range_from_global_vocab_size(global_vocab_size: int, - rank: int, - world_size: int, - offset: int = 0) -> Sequence[int]: +def vocab_range_from_global_vocab_size( + global_vocab_size: int, rank: int, world_size: int, offset: int = 0 +) -> Sequence[int]: per_partition_vocab_size = divide(global_vocab_size, world_size) - return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, - rank, - offset=offset) + return vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size, rank, offset=offset + ) @dataclass class VocabParallelEmbeddingShardIndices: """Indices for a shard of a vocab parallel embedding.""" + padded_org_vocab_start_index: int padded_org_vocab_end_index: int padded_added_vocab_start_index: int @@ -100,13 +108,11 @@ class VocabParallelEmbeddingShardIndices: @property def num_org_elements_padded(self) -> int: - return (self.padded_org_vocab_end_index - - self.padded_org_vocab_start_index) + return self.padded_org_vocab_end_index - self.padded_org_vocab_start_index @property def num_added_elements_padded(self) -> int: - return (self.padded_added_vocab_end_index - - self.padded_added_vocab_start_index) + return self.padded_added_vocab_end_index - self.padded_added_vocab_start_index @property def num_org_vocab_padding(self) -> int: @@ -122,17 +128,14 @@ class VocabParallelEmbeddingShardIndices: def __post_init__(self): # sanity checks - assert (self.padded_org_vocab_start_index <= - self.padded_org_vocab_end_index) - assert (self.padded_added_vocab_start_index <= - self.padded_added_vocab_end_index) + assert self.padded_org_vocab_start_index <= self.padded_org_vocab_end_index + assert self.padded_added_vocab_start_index <= self.padded_added_vocab_end_index assert self.org_vocab_start_index <= self.org_vocab_end_index assert self.added_vocab_start_index <= self.added_vocab_end_index assert self.org_vocab_start_index <= self.padded_org_vocab_start_index - assert (self.added_vocab_start_index <= - self.padded_added_vocab_start_index) + assert self.added_vocab_start_index <= self.padded_added_vocab_start_index assert self.org_vocab_end_index <= self.padded_org_vocab_end_index assert self.added_vocab_end_index <= self.padded_added_vocab_end_index @@ -142,20 +145,27 @@ class VocabParallelEmbeddingShardIndices: @torch.jit.script def get_masked_input_and_mask( - input_: torch.Tensor, org_vocab_start_index: int, - org_vocab_end_index: int, num_org_vocab_padding: int, - added_vocab_start_index: int, - added_vocab_end_index: int) -> 
Tuple[torch.Tensor, torch.Tensor]: + input_: torch.Tensor, + org_vocab_start_index: int, + org_vocab_end_index: int, + num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int, +) -> Tuple[torch.Tensor, torch.Tensor]: # torch.jit.script will fuse all of the pointwise ops below # into a single kernel, making it very fast - org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < - org_vocab_end_index) + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) added_vocab_mask = (input_ >= added_vocab_start_index) & ( - input_ < added_vocab_end_index) - added_offset = added_vocab_start_index - ( - org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding - valid_offset = (org_vocab_start_index * - org_vocab_mask) + (added_offset * added_vocab_mask) + input_ < added_vocab_end_index + ) + added_offset = ( + added_vocab_start_index + - (org_vocab_end_index - org_vocab_start_index) + - num_org_vocab_padding + ) + valid_offset = (org_vocab_start_index * org_vocab_mask) + ( + added_offset * added_vocab_mask + ) vocab_mask = org_vocab_mask | added_vocab_mask input_ = vocab_mask * (input_ - valid_offset) return input_, ~vocab_mask @@ -200,15 +210,17 @@ class VocabParallelEmbedding(torch.nn.Module): prefix: full name of the layer in the state dict """ # noqa: E501 - def __init__(self, - num_embeddings: int, - embedding_dim: int, - params_dtype: Optional[torch.dtype] = None, - org_num_embeddings: Optional[int] = None, - padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - enable_tp: bool = True): + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + params_dtype: Optional[torch.dtype] = None, + org_num_embeddings: Optional[int] = None, + padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + enable_tp: bool = True, + ): super().__init__() self.enable_tp = enable_tp @@ -223,18 +235,22 @@ class VocabParallelEmbedding(torch.nn.Module): self.padding_size = padding_size self.org_vocab_size = org_num_embeddings or num_embeddings num_added_embeddings = num_embeddings - self.org_vocab_size - self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size, - self.padding_size) + self.org_vocab_size_padded = pad_vocab_size( + self.org_vocab_size, self.padding_size + ) self.num_embeddings_padded = pad_vocab_size( - self.org_vocab_size_padded + num_added_embeddings, - self.padding_size) + self.org_vocab_size_padded + num_added_embeddings, self.padding_size + ) assert self.org_vocab_size_padded <= self.num_embeddings_padded - self.shard_indices = self._get_indices(self.num_embeddings_padded, - self.org_vocab_size_padded, - self.num_embeddings, - self.org_vocab_size, tp_rank, - self.tp_size) + self.shard_indices = self._get_indices( + self.num_embeddings_padded, + self.org_vocab_size_padded, + self.num_embeddings, + self.org_vocab_size, + tp_rank, + self.tp_size, + ) self.embedding_dim = embedding_dim linear_method = None @@ -248,11 +264,13 @@ class VocabParallelEmbedding(torch.nn.Module): # layer type like ParallelLMHead, this is not important. 
is_embedding_layer = type(self) is VocabParallelEmbedding linear_method_implements_embedding = method_has_implemented_embedding( - type(linear_method)) + type(linear_method) + ) if is_embedding_layer and not linear_method_implements_embedding: raise NotImplementedError( f"The class {type(linear_method).__name__} must implement " - "the 'embedding' method, see UnquantizedEmbeddingMethod.") + "the 'embedding' method, see UnquantizedEmbeddingMethod." + ) self.linear_method: QuantizeMethodBase = linear_method @@ -260,53 +278,68 @@ params_dtype = torch.get_default_dtype() # Divide the weight matrix along the vocabulary dimension. self.num_added_embeddings = self.num_embeddings - self.org_vocab_size - self.num_embeddings_per_partition = divide(self.num_embeddings_padded, - self.tp_size) + self.num_embeddings_per_partition = divide( + self.num_embeddings_padded, self.tp_size + ) - assert (self.shard_indices.num_elements_padded == - self.num_embeddings_per_partition) + assert ( + self.shard_indices.num_elements_padded == self.num_embeddings_per_partition + ) self.num_org_embeddings_per_partition = ( - self.shard_indices.org_vocab_end_index - - self.shard_indices.org_vocab_start_index) + self.shard_indices.org_vocab_end_index + - self.shard_indices.org_vocab_start_index + ) self.num_added_embeddings_per_partition = ( - self.shard_indices.added_vocab_end_index - - self.shard_indices.added_vocab_start_index) + self.shard_indices.added_vocab_end_index + - self.shard_indices.added_vocab_start_index + ) - self.linear_method.create_weights(self, - self.embedding_dim, - [self.num_embeddings_per_partition], - self.embedding_dim, - self.num_embeddings_padded, - params_dtype=params_dtype, - weight_loader=self.weight_loader) + self.linear_method.create_weights( + self, + self.embedding_dim, + [self.num_embeddings_per_partition], + self.embedding_dim, + self.num_embeddings_padded, + params_dtype=params_dtype, + weight_loader=self.weight_loader, + ) @classmethod - def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int, - vocab_size: int, org_vocab_size: int, tp_rank: int, - tp_size: int) -> VocabParallelEmbeddingShardIndices: + def _get_indices( + cls, + vocab_size_padded: int, + org_vocab_size_padded: int, + vocab_size: int, + org_vocab_size: int, + tp_rank: int, + tp_size: int, + ) -> VocabParallelEmbeddingShardIndices: """Get start and end indices for vocab parallel embedding, following the layout outlined in the class docstring, based on the given tp_rank and tp_size.""" num_added_embeddings_padded = vocab_size_padded - org_vocab_size_padded padded_org_vocab_start_index, padded_org_vocab_end_index = ( - vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, - tp_size)) + vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) ) padded_added_vocab_start_index, padded_added_vocab_end_index = ( - vocab_range_from_global_vocab_size(num_added_embeddings_padded, - tp_rank, - tp_size, - offset=org_vocab_size)) + vocab_range_from_global_vocab_size( + num_added_embeddings_padded, tp_rank, tp_size, offset=org_vocab_size ) + ) # remove padding - org_vocab_start_index = min(padded_org_vocab_start_index, - org_vocab_size) + org_vocab_start_index = min(padded_org_vocab_start_index, org_vocab_size) org_vocab_end_index = min(padded_org_vocab_end_index, org_vocab_size) - added_vocab_start_index = min(padded_added_vocab_start_index, 
vocab_size) added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size) return VocabParallelEmbeddingShardIndices( - padded_org_vocab_start_index, padded_org_vocab_end_index, - padded_added_vocab_start_index, padded_added_vocab_end_index, - org_vocab_start_index, org_vocab_end_index, - added_vocab_start_index, added_vocab_end_index) + padded_org_vocab_start_index, + padded_org_vocab_end_index, + padded_added_vocab_start_index, + padded_added_vocab_end_index, + org_vocab_start_index, + org_vocab_end_index, + added_vocab_start_index, + added_vocab_end_index, + ) def get_sharded_to_full_mapping(self) -> Optional[List[int]]: """Get a mapping that can be used to reindex the gathered @@ -326,32 +359,49 @@ class VocabParallelEmbedding(torch.nn.Module): added_embeddings: List[int] = [] padding: List[int] = [] for tp_rank in range(self.tp_size): - shard_indices = self._get_indices(self.num_embeddings_padded, - self.org_vocab_size_padded, - self.num_embeddings, - self.org_vocab_size, tp_rank, - self.tp_size) + shard_indices = self._get_indices( + self.num_embeddings_padded, + self.org_vocab_size_padded, + self.num_embeddings, + self.org_vocab_size, + tp_rank, + self.tp_size, + ) range_start = self.num_embeddings_per_partition * tp_rank range_end = self.num_embeddings_per_partition * (tp_rank + 1) base_embeddings.extend( - range(range_start, - range_start + shard_indices.num_org_elements)) + range(range_start, range_start + shard_indices.num_org_elements) + ) padding.extend( - range(range_start + shard_indices.num_org_elements, - range_start + shard_indices.num_org_elements_padded)) + range( + range_start + shard_indices.num_org_elements, + range_start + shard_indices.num_org_elements_padded, + ) + ) added_embeddings.extend( range( range_start + shard_indices.num_org_elements_padded, - range_start + shard_indices.num_org_elements_padded + - shard_indices.num_added_elements)) + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements, + ) + ) padding.extend( range( - range_start + shard_indices.num_org_elements_padded + - shard_indices.num_added_elements, - range_start + shard_indices.num_org_elements_padded + - shard_indices.num_added_elements_padded)) - assert (range_start + shard_indices.num_org_elements_padded + - shard_indices.num_added_elements_padded == range_end) + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements, + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements_padded, + ) + ) + assert ( + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements_padded + == range_end + ) ret = base_embeddings + added_embeddings + padding assert len(ret) == self.num_embeddings_padded return ret @@ -385,10 +435,14 @@ class VocabParallelEmbedding(torch.nn.Module): # If param packed on the same dim we are sharding on, then # need to adjust offsets of loaded weight by pack_factor. 
if packed_dim is not None and packed_dim == output_dim: - packed_factor = param.packed_factor if isinstance( - param, BasevLLMParameter) else param.pack_factor - assert loaded_weight.shape[output_dim] == (self.org_vocab_size // - param.packed_factor) + packed_factor = ( + param.packed_factor + if isinstance(param, BasevLLMParameter) + else param.pack_factor + ) + assert loaded_weight.shape[output_dim] == ( + self.org_vocab_size // packed_factor + ) start_idx = start_idx // packed_factor shard_size = shard_size // packed_factor else: @@ -396,23 +450,24 @@ # Copy the data. loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - param[:loaded_weight.shape[0]].data.copy_(loaded_weight) - param[loaded_weight.shape[0]:].data.fill_(0) + param[: loaded_weight.shape[0]].data.copy_(loaded_weight) + param[loaded_weight.shape[0] :].data.fill_(0) def forward(self, input_): if self.tp_size > 1: # Build the mask. masked_input, input_mask = get_masked_input_and_mask( - input_, self.shard_indices.org_vocab_start_index, + input_, + self.shard_indices.org_vocab_start_index, self.shard_indices.org_vocab_end_index, self.shard_indices.num_org_vocab_padding, self.shard_indices.added_vocab_start_index, - self.shard_indices.added_vocab_end_index) + self.shard_indices.added_vocab_end_index, + ) else: masked_input = input_ # Get the embeddings. - output_parallel = self.linear_method.embedding(self, - masked_input.long()) + output_parallel = self.linear_method.embedding(self, masked_input.long()) # Mask the output embedding. if self.tp_size > 1: output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) @@ -426,9 +481,9 @@ s = f"num_embeddings={self.num_embeddings_per_partition}" s += f", embedding_dim={self.embedding_dim}" s += f", org_vocab_size={self.org_vocab_size}" - s += f', num_embeddings_padded={self.num_embeddings_padded}' + s += f", num_embeddings_padded={self.num_embeddings_padded}" if self.enable_tp: - s += f', tp_size={self.tp_size}' + s += f", tp_size={self.tp_size}" return s @@ -448,27 +503,38 @@ class ParallelLMHead(VocabParallelEmbedding): padding_size: padding size for the vocabulary. 
""" - def __init__(self, - num_embeddings: int, - embedding_dim: int, - bias: bool = False, - params_dtype: Optional[torch.dtype] = None, - org_num_embeddings: Optional[int] = None, - padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): - super().__init__(num_embeddings, embedding_dim, params_dtype, - org_num_embeddings, padding_size, quant_config, - prefix) + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + bias: bool = False, + params_dtype: Optional[torch.dtype] = None, + org_num_embeddings: Optional[int] = None, + padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__( + num_embeddings, + embedding_dim, + params_dtype, + org_num_embeddings, + padding_size, + quant_config, + prefix, + ) self.quant_config = quant_config if bias: self.bias = Parameter( - torch.empty(self.num_embeddings_per_partition, - dtype=params_dtype)) - set_weight_attrs(self.bias, { - "output_dim": 0, - "weight_loader": self.weight_loader, - }) + torch.empty(self.num_embeddings_per_partition, dtype=params_dtype) + ) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) else: self.register_parameter("bias", None) diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index 9c5ed14f3..b6555183b 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -86,8 +86,10 @@ class GenerateReqInput: self.parallel_sample_num = self.sampling_params.get("n", 1) else: # isinstance(self.sampling_params, list): self.parallel_sample_num = self.sampling_params[0].get("n", 1) - assert all(self.parallel_sample_num == sampling_params.get("n", 1) for sampling_params in self.sampling_params), ( - "The parallel_sample_num should be the same for all samples in sample params.") + assert all( + self.parallel_sample_num == sampling_params.get("n", 1) + for sampling_params in self.sampling_params + ), "The parallel_sample_num should be the same for all samples in sample params." 
 
         if self.parallel_sample_num > 1 and self.is_single:
             self.is_single = False
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 742b91398..79fe1cf9f 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -911,8 +911,7 @@ class ScheduleBatch:
         keep_indices = [
             i
             for i in range(len(self.reqs))
-            if not self.reqs[i].finished()
-            and self.reqs[i] is not being_chunked_req
+            if not self.reqs[i].finished() and self.reqs[i] is not being_chunked_req
         ]
 
         if keep_indices is None or len(keep_indices) == 0:
@@ -1043,6 +1042,7 @@ class ScheduleBatch:
             for req in self.reqs:
                 req.started_time = time.time()
 
+
 @dataclasses.dataclass
 class ModelWorkerBatch:
     # The batch id
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index f7933e0ac..f0d191a29 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -224,8 +224,8 @@ class Scheduler:
         self.forward_ct = 0
         self.forward_ct_decode = 0
         self.num_generated_tokens = 0
-        self.last_stats_tic = time.time() # time of last stats for every iter
-        self.last_log_tic = time.time() # time of last log for print decode log
+        self.last_stats_tic = time.time()  # time of last stats for every iter
+        self.last_log_tic = time.time()  # time of last log for print decode log
         self.stream_interval = server_args.stream_interval
 
         # Init chunked prefill
@@ -566,9 +566,7 @@ class Scheduler:
             and not self.last_batch.is_empty()
         ):
             if self.being_chunked_req:
-                self.last_batch.filter_batch(
-                    being_chunked_req=self.being_chunked_req
-                )
+                self.last_batch.filter_batch(being_chunked_req=self.being_chunked_req)
                 self.tree_cache.cache_unfinished_req(self.being_chunked_req)
                 # Inflight request keeps its rid but will get a new req_pool_idx.
                 self.req_to_token_pool.free(self.being_chunked_req.req_pool_idx)
@@ -628,9 +626,7 @@ class Scheduler:
         has_inflight = self.being_chunked_req is not None
         if has_inflight:
             self.being_chunked_req.init_next_round_input()
-            self.being_chunked_req = adder.add_inflight_req(
-                self.being_chunked_req
-            )
+            self.being_chunked_req = adder.add_inflight_req(self.being_chunked_req)
 
         if self.lora_paths:
             lora_set = (
@@ -813,7 +809,8 @@ class Scheduler:
             embeddings = self.tp_worker.forward_batch_embedding(model_worker_batch)
             ret = embeddings, model_worker_batch.bid
         return ret
-    def get_stats(self,batch: ScheduleBatch):
+
+    def get_stats(self, batch: ScheduleBatch):
         # TODO: get stats for chunked prefill
         now = time.time()
@@ -829,8 +826,8 @@ class Scheduler:
         # set stats from prefill
         if self.stats is not None:
             # new_seq=self.stats.new_seq
-            cache_hit_rate=self.stats.cache_hit_rate
-            token_usage=self.stats.token_usage
+            cache_hit_rate = self.stats.cache_hit_rate
+            token_usage = self.stats.token_usage
         # Iteration stats
         num_prompt_tokens_iter = 0
         num_generation_tokens_iter = 0
@@ -851,15 +848,19 @@ class Scheduler:
         # _, next_token_ids, _ = result
         if batch is not None:
             num_generation_tokens_iter = len(batch.output_ids)
-            gen_throughput = round(num_generation_tokens_iter / (now - self.last_stats_tic), 2)
+            gen_throughput = round(
+                num_generation_tokens_iter / (now - self.last_stats_tic), 2
+            )
 
             for i, req in enumerate(batch.reqs):
                 # NOTE: Batch forward mode is extend before start decode,
                 if batch.forward_mode.is_extend():
-                    num_prompt_tokens_iter=len(batch.input_ids)+sum(batch.prefix_lens)
+                    num_prompt_tokens_iter = len(batch.input_ids) + sum(
+                        batch.prefix_lens
+                    )
                     time_to_first_tokens_iter.append(now - req.started_time)
                 else:
-                    time_per_output_tokens_iter.append(now-self.last_stats_tic)
+                    time_per_output_tokens_iter.append(now - self.last_stats_tic)
 
                 if req.finished():
                     time_e2e_requests.append(now - req.created_time)
@@ -867,9 +868,10 @@ class Scheduler:
                     num_prompt_tokens_requests.append(len(req.origin_input_ids))
                     num_generation_tokens_requests.append(len(req.output_ids))
                     finished_reason_requests.append(
-                        req.finished_reason.to_json()
-                        if req.finished_reason is not None
-                        else None)
+                        req.finished_reason.to_json()
+                        if req.finished_reason is not None
+                        else None
+                    )
 
         return Stats(
             new_seq=new_seq,
@@ -893,7 +895,7 @@ class Scheduler:
             max_running_requests=self.max_running_requests,
         )
 
-    def log_stats(self,stats:Stats):
+    def log_stats(self, stats: Stats):
         self.metrics_collector.log_stats(stats)
 
     def process_batch_result(self, batch: ScheduleBatch, result):
@@ -1003,9 +1005,7 @@ class Scheduler:
             if req.is_retracted:
                 continue
 
-            if self.server_args.enable_overlap_schedule and (
-                req.finished()
-            ):
+            if self.server_args.enable_overlap_schedule and (req.finished()):
                 self.token_to_kv_pool.free(batch.out_cache_loc[i : i + 1])
                 continue
@@ -1031,7 +1031,10 @@ class Scheduler:
         self.token_to_kv_pool.free_group_end()
 
         self.forward_ct_decode = (self.forward_ct_decode + 1) % (1 << 30)
-        if self.tp_rank == 0 and self.forward_ct_decode % self.server_args.decode_log_interval == 0:
+        if (
+            self.tp_rank == 0
+            and self.forward_ct_decode % self.server_args.decode_log_interval == 0
+        ):
             self.print_decode_stats()
 
     def add_logprob_return_values(
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index 60cfc1be1..78f35903f 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -215,7 +215,7 @@ class TokenizerManager:
                 logprob_start_len,
                 top_logprobs_num,
                 obj.stream,
-                obj.lora_path
+                obj.lora_path,
             )
         elif isinstance(obj, EmbeddingReqInput):
             tokenized_obj = TokenizedEmbeddingReqInput(
@@ -290,7 +290,9 @@ class TokenizerManager:
 
         # Tokenize all requests
         objs = [obj[i] for i in range(batch_size)]
-        tokenized_objs = await asyncio.gather(*(self._tokenize_one_request(obj) for obj in objs))
+        tokenized_objs = await asyncio.gather(
+            *(self._tokenize_one_request(obj) for obj in objs)
+        )
 
         # Cache the common prefix for parallel sampling
         for i in range(batch_size):
@@ -322,7 +324,9 @@ class TokenizerManager:
         rid_to_index = {rid: i for i, rid in enumerate(rids)}
         task_map = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
         while task_map:
-            done, _ = await asyncio.wait(task_map.keys(), return_when=asyncio.FIRST_COMPLETED)
+            done, _ = await asyncio.wait(
+                task_map.keys(), return_when=asyncio.FIRST_COMPLETED
+            )
 
             for task in done:
                 gen = task_map.pop(task)
@@ -367,7 +371,7 @@ class TokenizerManager:
         if self.server_args.dp_size == 1:
             res = await self.mem_pool_size
             return res.size
-        else: # self.server_args.dp_size > 1
+        else:  # self.server_args.dp_size > 1
             self.mem_pool_size_tmp = []
             res = await self.mem_pool_size
             ret = [r.size for r in res]
@@ -399,7 +403,7 @@ class TokenizerManager:
                 self.server_args.load_format = obj.load_format
                 self.model_path = obj.model_path
             return result.success, result.message
-        else: # self.server_args.dp_size > 1
+        else:  # self.server_args.dp_size > 1
             self.model_update_tmp = []
             result = await self.model_update_result
@@ -470,7 +474,7 @@ class TokenizerManager:
             if isinstance(recv_obj, UpdateWeightReqOutput):
                 if self.server_args.dp_size == 1:
                     self.model_update_result.set_result(recv_obj)
-                else: # self.server_args.dp_size > 1
+                else:  # self.server_args.dp_size > 1
                     self.model_update_tmp.append(recv_obj)
                     # set future if all the results are received
                     if len(self.model_update_tmp) == self.server_args.dp_size:
@@ -479,7 +483,7 @@ class TokenizerManager:
             elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
                 if self.server_args.dp_size == 1:
                     self.mem_pool_size.set_result(recv_obj)
-                else: # self.sever_args.dp_size > 1
+                else:  # self.server_args.dp_size > 1
                     self.mem_pool_size_tmp.append(recv_obj)
                     # set future if all the results are received
                     if len(self.mem_pool_size_tmp) == self.server_args.dp_size:
diff --git a/python/sglang/srt/metrics/metrics_collector.py b/python/sglang/srt/metrics/metrics_collector.py
index df7d6961d..91a849414 100644
--- a/python/sglang/srt/metrics/metrics_collector.py
+++ b/python/sglang/srt/metrics/metrics_collector.py
@@ -130,27 +130,65 @@ class Metrics:
         self.counter_prompt_tokens = Counter(
             name="sglang:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.counter_generation_tokens = Counter(
             name="sglang:generation_tokens_total",
             documentation="Number of generation tokens processed.",
-            labelnames=labelnames)
+            labelnames=labelnames,
+        )
         self.histogram_time_to_first_token = Histogram(
             name="sglang:time_to_first_token_seconds",
             documentation="Histogram of time to first token in seconds.",
             labelnames=labelnames,
             buckets=[
-                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
-                0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0, 25.0, 30.0
-            ])
+                0.001,
+                0.005,
+                0.01,
+                0.02,
+                0.04,
+                0.06,
+                0.08,
+                0.1,
+                0.25,
+                0.5,
+                0.75,
+                1.0,
+                2.5,
+                5.0,
+                7.5,
+                10.0,
+                15.0,
+                20.0,
+                25.0,
+                30.0,
+            ],
+        )
         self.histogram_time_per_output_token = Histogram(
name="sglang:time_per_output_token_seconds", documentation="Histogram of time per output token in seconds.", labelnames=labelnames, buckets=[ - 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, - 1.0, 2.5 - ]) + 0.005, + 0.01, + 0.015, + 0.02, + 0.025, + 0.03, + 0.04, + 0.05, + 0.075, + 0.1, + 0.15, + 0.2, + 0.3, + 0.4, + 0.5, + 0.75, + 1.0, + 2.5, + ], + ) # Request Stats # Metadata @@ -245,14 +283,19 @@ class PrometheusMetricsCollector(MetricsCollector): stats.num_generation_tokens_requests, ) - self._log_counter(self.metrics.counter_prompt_tokens, - stats.num_prompt_tokens_iter) - self._log_counter(self.metrics.counter_generation_tokens, - stats.num_generation_tokens_iter) - self._log_histogram(self.metrics.histogram_time_to_first_token, - stats.time_to_first_tokens_iter) - self._log_histogram(self.metrics.histogram_time_per_output_token, - stats.time_per_output_tokens_iter) + self._log_counter( + self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter + ) + self._log_counter( + self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter + ) + self._log_histogram( + self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter + ) + self._log_histogram( + self.metrics.histogram_time_per_output_token, + stats.time_per_output_tokens_iter, + ) # self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys) self._log_gauge(self.metrics.num_running_sys, stats.num_running_req) diff --git a/python/sglang/srt/models/gpt2.py b/python/sglang/srt/models/gpt2.py index 3495f24d0..8d988fe8e 100644 --- a/python/sglang/srt/models/gpt2.py +++ b/python/sglang/srt/models/gpt2.py @@ -28,7 +28,7 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.model_loader.weight_utils import default_weight_loader -#from sglang.srt.layers.activation import get_act_fn +# from sglang.srt.layers.activation import get_act_fn from sglang.srt.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -47,15 +47,14 @@ class GPT2Attention(nn.Module): self, layer_id: int, config: GPT2Config, - cache_config = None, + cache_config=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): super().__init__() self.hidden_size = config.hidden_size total_num_heads = config.num_attention_heads - tensor_model_parallel_world_size = ( - get_tensor_model_parallel_world_size()) + tensor_model_parallel_world_size = get_tensor_model_parallel_world_size() assert total_num_heads % tensor_model_parallel_world_size == 0 self.num_heads = total_num_heads // tensor_model_parallel_world_size self.head_dim = self.hidden_size // total_num_heads @@ -76,11 +75,13 @@ class GPT2Attention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.c_proj", ) - self.attn = RadixAttention(self.num_heads, - self.head_dim, - scaling=self.scale, - num_kv_heads=total_num_heads, - layer_id=layer_id) + self.attn = RadixAttention( + self.num_heads, + self.head_dim, + scaling=self.scale, + num_kv_heads=total_num_heads, + layer_id=layer_id, + ) def forward( self, @@ -119,10 +120,14 @@ class GPT2MLP(nn.Module): quant_config=quant_config, prefix=f"{prefix}.c_proj", ) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) + self.act = get_act_fn( + config.activation_function, quant_config, intermediate_size + ) - def forward(self, hidden_states: torch.Tensor,) -> torch.Tensor: + def forward( + self, + 
hidden_states: torch.Tensor, + ) -> torch.Tensor: hidden_states, _ = self.c_fc(hidden_states) hidden_states = self.act(hidden_states) hidden_states, _ = self.c_proj(hidden_states) @@ -135,27 +140,20 @@ class GPT2Block(nn.Module): self, layer_id: int, config: GPT2Config, - cache_config = None, - + cache_config=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): super().__init__() hidden_size = config.hidden_size - inner_dim = (config.n_inner if config.n_inner is not None else 4 * - hidden_size) + inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = GPT2Attention(layer_id, - config, - cache_config, - quant_config, - prefix=f"{prefix}.attn") + self.attn = GPT2Attention( + layer_id, config, cache_config, quant_config, prefix=f"{prefix}.attn" + ) self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = GPT2MLP(inner_dim, - config, - quant_config, - prefix=f"{prefix}.mlp") + self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") def forward( self, @@ -179,13 +177,12 @@ class GPT2Block(nn.Module): return hidden_states - class GPT2Model(nn.Module): def __init__( self, config: GPT2Config, - cache_config = None, + cache_config=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -229,16 +226,15 @@ class GPT2LMHeadModel(nn.Module): def __init__( self, config: GPT2Config, - cache_config = None, + cache_config=None, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() self.config = config self.quant_config = quant_config - self.transformer = GPT2Model(config, - cache_config, - quant_config, - prefix="transformer") + self.transformer = GPT2Model( + config, cache_config, quant_config, prefix="transformer" + ) self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config) @@ -254,8 +250,6 @@ class GPT2LMHeadModel(nn.Module): input_ids, hidden_states, self.lm_head.weight, forward_batch ) - - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters(remove_duplicate=False)) for name, loaded_weight in weights: @@ -280,8 +274,8 @@ class GPT2LMHeadModel(nn.Module): if not name.endswith(".weight"): continue loaded_weight = loaded_weight.t() - weight_loader = getattr(param, "weight_loader", - default_weight_loader) + weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + EntryClass = GPT2LMHeadModel diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 895af0e69..1ed8af0e7 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -419,6 +419,7 @@ def launch_engine( for i in range(len(scheduler_pipe_readers)): scheduler_pipe_readers[i].recv() + def add_prometheus_middleware(app: FastAPI): # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.1/vllm/entrypoints/openai/api_server.py#L216 from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess @@ -490,6 +491,7 @@ def launch_server( finally: t.join() + def _set_prometheus_env(): # Set prometheus multiprocess directory # sglang uses prometheus multiprocess mode @@ -506,6 +508,7 @@ def _set_prometheus_env(): os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name logger.debug(f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}") + def _set_envs_and_config(server_args: ServerArgs): # Set global environments 
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" @@ -763,8 +766,8 @@ class Engine: # runtime server default log level is log # offline engine works in scripts, so we set it to error - if 'log_level' not in kwargs: - kwargs['log_level'] = 'error' + if "log_level" not in kwargs: + kwargs["log_level"] = "error" server_args = ServerArgs(*args, **kwargs) launch_engine(server_args=server_args) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 84d1afbd5..53a493bde 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -448,7 +448,7 @@ class ServerArgs: "--decode-log-interval", type=int, default=ServerArgs.decode_log_interval, - help="The log interval of decode batch" + help="The log interval of decode batch", ) # Data parallelism diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 8a486131f..2c68a22b4 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -742,7 +742,13 @@ def run_mmlu_test( finally: pass - run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size) + run_and_check_memory_leak( + workload_func, + disable_radix_cache, + enable_mixed_chunk, + enable_overlap, + chunked_prefill_size, + ) def run_mulit_request_test( @@ -775,4 +781,10 @@ def run_mulit_request_test( with ThreadPoolExecutor(2) as executor: list(executor.map(run_one, list(range(4)))) - run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size) + run_and_check_memory_leak( + workload_func, + disable_radix_cache, + enable_mixed_chunk, + enable_overlap, + chunked_prefill_size, + ) diff --git a/python/sglang/utils.py b/python/sglang/utils.py index 9c1fc6795..e694dc198 100644 --- a/python/sglang/utils.py +++ b/python/sglang/utils.py @@ -349,6 +349,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None: def terminate_process(process): from sglang.srt.utils import kill_child_process + kill_child_process(process.pid, include_self=True) diff --git a/rust/test_bindings.py b/rust/test_bindings.py index d81e1451f..c4ecfe3c6 100644 --- a/rust/test_bindings.py +++ b/rust/test_bindings.py @@ -11,7 +11,7 @@ router = router.Router( "http://localhost:30000", "http://localhost:30002", ], - policy="random" + policy="random", ) # Start the router - this will block and run the server diff --git a/scripts/playground/reference_hf.py b/scripts/playground/reference_hf.py index 3f5fe2024..bf56fc3c9 100644 --- a/scripts/playground/reference_hf.py +++ b/scripts/playground/reference_hf.py @@ -104,15 +104,9 @@ if __name__ == "__main__": default="TinyLlama/TinyLlama-1.1B-Chat-v0.4", # default="meta-llama/Llama-2-7b-chat-hf", ) - parser.add_argument( - "--max-new-tokens", - type=int, - default=16) + parser.add_argument("--max-new-tokens", type=int, default=16) - parser.add_argument( - "--dtype", - type=str, - default="float16") + parser.add_argument("--dtype", type=str, default="float16") args = parser.parse_args() diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py index b4c2cde2d..4e3f051e3 100644 --- a/test/srt/models/test_generation_models.py +++ b/test/srt/models/test_generation_models.py @@ -56,7 +56,7 @@ ALL_OTHER_MODELS = [ ModelCase("HuggingFaceTB/SmolLM-135M-Instruct", skip_long_prompt=True), ModelCase("allenai/OLMo-1B-0724-hf", decode_tolerance=8e-2, skip_long_prompt=True), ModelCase("THUDM/glm-4-9b-chat"), - 
ModelCase("openai-community/gpt2") + ModelCase("openai-community/gpt2"), ] TORCH_DTYPES = [torch.float16] diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index d6ae76b8a..070a0633c 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -3,6 +3,7 @@ python3 -m unittest test_openai_server.TestOpenAIServer.test_batch python3 -m unittest test_openai_server.TestOpenAIServer.test_completion """ + import json import time import unittest diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index 3631780da..a95026e20 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -1,6 +1,7 @@ """ python3 -m unittest test_skip_tokenizer_init.TestSkipTokenizerInit.test_parallel_sample """ + import json import unittest diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 38781b0e2..0bf46c771 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -110,7 +110,6 @@ class TestSRTEngine(unittest.TestCase): def test_5_prompt_input_ids_consistency(self): prompt = "The capital of UK is" - model_path = DEFAULT_MODEL_NAME_FOR_TEST engine = sgl.Engine(model_path=model_path, random_seed=42, log_level="error") sampling_params = {"temperature": 0, "max_new_tokens": 8} @@ -118,7 +117,9 @@ class TestSRTEngine(unittest.TestCase): tokenizer = get_tokenizer(model_path) token_ids = tokenizer.encode(prompt) - out2 = engine.generate(input_ids=token_ids, sampling_params=sampling_params)["text"] + out2 = engine.generate(input_ids=token_ids, sampling_params=sampling_params)[ + "text" + ] engine.shutdown()