fix black in pre-commit (#1940)
This commit is contained in:
@@ -30,6 +30,6 @@ repos:
|
||||
rev: 24.10.0
|
||||
hooks:
|
||||
- id: black
|
||||
additional_dependencies: ['.[jupyter]']
|
||||
types: [python, jupyter]
|
||||
types_or: [python, jupyter]
|
||||
types: [python]
|
||||
- id: black-jupyter
|
||||
types: [jupyter]
|
||||
|
||||
@@ -34,10 +34,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:08:08.536886Z",
|
||||
"iopub.status.busy": "2024-11-05T05:08:08.536763Z",
|
||||
"iopub.status.idle": "2024-11-05T05:08:34.725831Z",
|
||||
"shell.execute_reply": "2024-11-05T05:08:34.725316Z"
|
||||
"iopub.execute_input": "2024-11-07T18:44:42.063503Z",
|
||||
"iopub.status.busy": "2024-11-07T18:44:42.063379Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:07.255300Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:07.254547Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -73,10 +73,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:08:34.727530Z",
|
||||
"iopub.status.busy": "2024-11-05T05:08:34.727333Z",
|
||||
"iopub.status.idle": "2024-11-05T05:08:35.359784Z",
|
||||
"shell.execute_reply": "2024-11-05T05:08:35.359090Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:07.258292Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:07.257710Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:07.611559Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:07.610842Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -101,10 +101,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:08:35.362286Z",
|
||||
"iopub.status.busy": "2024-11-05T05:08:35.362140Z",
|
||||
"iopub.status.idle": "2024-11-05T05:08:35.368711Z",
|
||||
"shell.execute_reply": "2024-11-05T05:08:35.368220Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:07.613911Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:07.613746Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:07.620286Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:07.619779Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -132,10 +132,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:08:35.371313Z",
|
||||
"iopub.status.busy": "2024-11-05T05:08:35.370877Z",
|
||||
"iopub.status.idle": "2024-11-05T05:08:35.376712Z",
|
||||
"shell.execute_reply": "2024-11-05T05:08:35.376230Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:07.622407Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:07.622267Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:07.628290Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:07.627793Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -164,10 +164,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:08:35.378982Z",
|
||||
"iopub.status.busy": "2024-11-05T05:08:35.378597Z",
|
||||
"iopub.status.idle": "2024-11-05T05:08:35.391820Z",
|
||||
"shell.execute_reply": "2024-11-05T05:08:35.391336Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:07.630585Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:07.630235Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:07.643498Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:07.643007Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -183,10 +183,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:08:35.393748Z",
|
||||
"iopub.status.busy": "2024-11-05T05:08:35.393606Z",
|
||||
"iopub.status.idle": "2024-11-05T05:08:35.398645Z",
|
||||
"shell.execute_reply": "2024-11-05T05:08:35.398145Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:07.645336Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:07.645196Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:07.650363Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:07.649837Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -211,10 +211,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:08:35.400683Z",
|
||||
"iopub.status.busy": "2024-11-05T05:08:35.400419Z",
|
||||
"iopub.status.idle": "2024-11-05T05:08:35.406146Z",
|
||||
"shell.execute_reply": "2024-11-05T05:08:35.405661Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:07.652212Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:07.652076Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:07.658633Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:07.658119Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -241,10 +241,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:08:35.408176Z",
|
||||
"iopub.status.busy": "2024-11-05T05:08:35.407884Z",
|
||||
"iopub.status.idle": "2024-11-05T05:08:35.413587Z",
|
||||
"shell.execute_reply": "2024-11-05T05:08:35.413108Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:07.660468Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:07.660325Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:07.666476Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:07.665984Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -271,10 +271,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:08:35.416090Z",
|
||||
"iopub.status.busy": "2024-11-05T05:08:35.415793Z",
|
||||
"iopub.status.idle": "2024-11-05T05:08:36.552549Z",
|
||||
"shell.execute_reply": "2024-11-05T05:08:36.551870Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:07.668242Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:07.668108Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:08.725709Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:08.725021Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -296,10 +296,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:08:36.554823Z",
|
||||
"iopub.status.busy": "2024-11-05T05:08:36.554680Z",
|
||||
"iopub.status.idle": "2024-11-05T05:08:38.053945Z",
|
||||
"shell.execute_reply": "2024-11-05T05:08:38.053034Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:08.727865Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:08.727721Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:11.165841Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:11.165282Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -335,10 +335,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:08:38.056783Z",
|
||||
"iopub.status.busy": "2024-11-05T05:08:38.056497Z",
|
||||
"iopub.status.idle": "2024-11-05T05:09:04.436030Z",
|
||||
"shell.execute_reply": "2024-11-05T05:09:04.435311Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:11.167853Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:11.167711Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:39.542988Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:39.542135Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -360,10 +360,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:09:04.438987Z",
|
||||
"iopub.status.busy": "2024-11-05T05:09:04.438568Z",
|
||||
"iopub.status.idle": "2024-11-05T05:09:04.485291Z",
|
||||
"shell.execute_reply": "2024-11-05T05:09:04.484829Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:39.545416Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:39.545005Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:39.588793Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:39.588054Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -392,10 +392,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:09:04.487191Z",
|
||||
"iopub.status.busy": "2024-11-05T05:09:04.486929Z",
|
||||
"iopub.status.idle": "2024-11-05T05:09:25.553481Z",
|
||||
"shell.execute_reply": "2024-11-05T05:09:25.552747Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:39.590729Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:39.590446Z",
|
||||
"iopub.status.idle": "2024-11-07T18:45:59.660376Z",
|
||||
"shell.execute_reply": "2024-11-07T18:45:59.659992Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -419,10 +419,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:09:25.555813Z",
|
||||
"iopub.status.busy": "2024-11-05T05:09:25.555666Z",
|
||||
"iopub.status.idle": "2024-11-05T05:09:26.354372Z",
|
||||
"shell.execute_reply": "2024-11-05T05:09:26.353693Z"
|
||||
"iopub.execute_input": "2024-11-07T18:45:59.661779Z",
|
||||
"iopub.status.busy": "2024-11-07T18:45:59.661641Z",
|
||||
"iopub.status.idle": "2024-11-07T18:46:00.475726Z",
|
||||
"shell.execute_reply": "2024-11-07T18:46:00.475269Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -445,10 +445,7 @@
|
||||
"prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n",
|
||||
"\n",
|
||||
"url = \"http://localhost:30030/classify\"\n",
|
||||
"data = {\n",
|
||||
" \"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \n",
|
||||
" \"text\": prompts\n",
|
||||
"}\n",
|
||||
"data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n",
|
||||
"\n",
|
||||
"responses = requests.post(url, json=data).json()\n",
|
||||
"for response in responses:\n",
|
||||
@@ -460,10 +457,10 @@
|
||||
"execution_count": 15,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:09:26.356532Z",
|
||||
"iopub.status.busy": "2024-11-05T05:09:26.356327Z",
|
||||
"iopub.status.idle": "2024-11-05T05:09:26.396590Z",
|
||||
"shell.execute_reply": "2024-11-05T05:09:26.395914Z"
|
||||
"iopub.execute_input": "2024-11-07T18:46:00.477283Z",
|
||||
"iopub.status.busy": "2024-11-07T18:46:00.477025Z",
|
||||
"iopub.status.idle": "2024-11-07T18:46:00.525758Z",
|
||||
"shell.execute_reply": "2024-11-07T18:46:00.525236Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
|
||||
@@ -35,10 +35,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:21:27.503026Z",
|
||||
"iopub.status.busy": "2024-11-05T05:21:27.502741Z",
|
||||
"iopub.status.idle": "2024-11-05T05:21:49.554631Z",
|
||||
"shell.execute_reply": "2024-11-05T05:21:49.553690Z"
|
||||
"iopub.execute_input": "2024-11-07T18:46:04.789536Z",
|
||||
"iopub.status.busy": "2024-11-07T18:46:04.789418Z",
|
||||
"iopub.status.idle": "2024-11-07T18:46:27.038169Z",
|
||||
"shell.execute_reply": "2024-11-07T18:46:27.037540Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -64,10 +64,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:21:49.558275Z",
|
||||
"iopub.status.busy": "2024-11-05T05:21:49.558110Z",
|
||||
"iopub.status.idle": "2024-11-05T05:21:52.717287Z",
|
||||
"shell.execute_reply": "2024-11-05T05:21:52.716842Z"
|
||||
"iopub.execute_input": "2024-11-07T18:46:27.040005Z",
|
||||
"iopub.status.busy": "2024-11-07T18:46:27.039872Z",
|
||||
"iopub.status.idle": "2024-11-07T18:46:30.203840Z",
|
||||
"shell.execute_reply": "2024-11-07T18:46:30.203368Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -99,10 +99,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:21:52.721738Z",
|
||||
"iopub.status.busy": "2024-11-05T05:21:52.720908Z",
|
||||
"iopub.status.idle": "2024-11-05T05:22:01.770341Z",
|
||||
"shell.execute_reply": "2024-11-05T05:22:01.769510Z"
|
||||
"iopub.execute_input": "2024-11-07T18:46:30.205880Z",
|
||||
"iopub.status.busy": "2024-11-07T18:46:30.205719Z",
|
||||
"iopub.status.idle": "2024-11-07T18:46:39.256561Z",
|
||||
"shell.execute_reply": "2024-11-07T18:46:39.255880Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -137,10 +137,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:22:01.772662Z",
|
||||
"iopub.status.busy": "2024-11-05T05:22:01.772377Z",
|
||||
"iopub.status.idle": "2024-11-05T05:22:04.897499Z",
|
||||
"shell.execute_reply": "2024-11-05T05:22:04.896867Z"
|
||||
"iopub.execute_input": "2024-11-07T18:46:39.259464Z",
|
||||
"iopub.status.busy": "2024-11-07T18:46:39.259309Z",
|
||||
"iopub.status.idle": "2024-11-07T18:46:42.384955Z",
|
||||
"shell.execute_reply": "2024-11-07T18:46:42.384378Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -179,10 +179,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:22:04.899754Z",
|
||||
"iopub.status.busy": "2024-11-05T05:22:04.899478Z",
|
||||
"iopub.status.idle": "2024-11-05T05:22:13.970245Z",
|
||||
"shell.execute_reply": "2024-11-05T05:22:13.969779Z"
|
||||
"iopub.execute_input": "2024-11-07T18:46:42.387431Z",
|
||||
"iopub.status.busy": "2024-11-07T18:46:42.387279Z",
|
||||
"iopub.status.idle": "2024-11-07T18:46:51.448572Z",
|
||||
"shell.execute_reply": "2024-11-07T18:46:51.447781Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -216,10 +216,10 @@
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:22:13.972039Z",
|
||||
"iopub.status.busy": "2024-11-05T05:22:13.971846Z",
|
||||
"iopub.status.idle": "2024-11-05T05:22:14.027421Z",
|
||||
"shell.execute_reply": "2024-11-05T05:22:14.027003Z"
|
||||
"iopub.execute_input": "2024-11-07T18:46:51.451177Z",
|
||||
"iopub.status.busy": "2024-11-07T18:46:51.450952Z",
|
||||
"iopub.status.idle": "2024-11-07T18:46:51.497530Z",
|
||||
"shell.execute_reply": "2024-11-07T18:46:51.496850Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
|
||||
@@ -39,10 +39,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:09:30.637832Z",
|
||||
"iopub.status.busy": "2024-11-05T05:09:30.637709Z",
|
||||
"iopub.status.idle": "2024-11-05T05:09:58.830158Z",
|
||||
"shell.execute_reply": "2024-11-05T05:09:58.829395Z"
|
||||
"iopub.execute_input": "2024-11-07T18:46:54.813876Z",
|
||||
"iopub.status.busy": "2024-11-07T18:46:54.813741Z",
|
||||
"iopub.status.idle": "2024-11-07T18:47:24.015527Z",
|
||||
"shell.execute_reply": "2024-11-07T18:47:24.014987Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -79,10 +79,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:09:58.833008Z",
|
||||
"iopub.status.busy": "2024-11-05T05:09:58.832805Z",
|
||||
"iopub.status.idle": "2024-11-05T05:10:00.187146Z",
|
||||
"shell.execute_reply": "2024-11-05T05:10:00.186657Z"
|
||||
"iopub.execute_input": "2024-11-07T18:47:24.018153Z",
|
||||
"iopub.status.busy": "2024-11-07T18:47:24.017755Z",
|
||||
"iopub.status.idle": "2024-11-07T18:47:25.374821Z",
|
||||
"shell.execute_reply": "2024-11-07T18:47:25.374397Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -119,10 +119,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:10:00.189444Z",
|
||||
"iopub.status.busy": "2024-11-05T05:10:00.189289Z",
|
||||
"iopub.status.idle": "2024-11-05T05:10:03.291891Z",
|
||||
"shell.execute_reply": "2024-11-05T05:10:03.291173Z"
|
||||
"iopub.execute_input": "2024-11-07T18:47:25.376617Z",
|
||||
"iopub.status.busy": "2024-11-07T18:47:25.376495Z",
|
||||
"iopub.status.idle": "2024-11-07T18:47:28.482537Z",
|
||||
"shell.execute_reply": "2024-11-07T18:47:28.482125Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -165,10 +165,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:10:03.294389Z",
|
||||
"iopub.status.busy": "2024-11-05T05:10:03.294237Z",
|
||||
"iopub.status.idle": "2024-11-05T05:10:03.469357Z",
|
||||
"shell.execute_reply": "2024-11-05T05:10:03.468661Z"
|
||||
"iopub.execute_input": "2024-11-07T18:47:28.484819Z",
|
||||
"iopub.status.busy": "2024-11-07T18:47:28.484673Z",
|
||||
"iopub.status.idle": "2024-11-07T18:47:28.659814Z",
|
||||
"shell.execute_reply": "2024-11-07T18:47:28.659435Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -198,10 +198,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:10:03.471573Z",
|
||||
"iopub.status.busy": "2024-11-05T05:10:03.471430Z",
|
||||
"iopub.status.idle": "2024-11-05T05:10:04.977081Z",
|
||||
"shell.execute_reply": "2024-11-05T05:10:04.976391Z"
|
||||
"iopub.execute_input": "2024-11-07T18:47:28.661844Z",
|
||||
"iopub.status.busy": "2024-11-07T18:47:28.661710Z",
|
||||
"iopub.status.idle": "2024-11-07T18:47:30.168922Z",
|
||||
"shell.execute_reply": "2024-11-07T18:47:30.168600Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -234,10 +234,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:10:04.979428Z",
|
||||
"iopub.status.busy": "2024-11-05T05:10:04.979272Z",
|
||||
"iopub.status.idle": "2024-11-05T05:10:08.568761Z",
|
||||
"shell.execute_reply": "2024-11-05T05:10:08.568355Z"
|
||||
"iopub.execute_input": "2024-11-07T18:47:30.171319Z",
|
||||
"iopub.status.busy": "2024-11-07T18:47:30.171176Z",
|
||||
"iopub.status.idle": "2024-11-07T18:47:33.760113Z",
|
||||
"shell.execute_reply": "2024-11-07T18:47:33.759713Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -273,10 +273,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:10:08.571102Z",
|
||||
"iopub.status.busy": "2024-11-05T05:10:08.570964Z",
|
||||
"iopub.status.idle": "2024-11-05T05:10:23.214087Z",
|
||||
"shell.execute_reply": "2024-11-05T05:10:23.213664Z"
|
||||
"iopub.execute_input": "2024-11-07T18:47:33.762729Z",
|
||||
"iopub.status.busy": "2024-11-07T18:47:33.762590Z",
|
||||
"iopub.status.idle": "2024-11-07T18:47:34.255316Z",
|
||||
"shell.execute_reply": "2024-11-07T18:47:34.254907Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -297,7 +297,10 @@
|
||||
"response = client.chat.completions.create(\n",
|
||||
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": \"Give me the information of the capital of France in the JSON format.\"},\n",
|
||||
" {\n",
|
||||
" \"role\": \"user\",\n",
|
||||
" \"content\": \"Give me the information of the capital of France in the JSON format.\",\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" temperature=0,\n",
|
||||
" max_tokens=128,\n",
|
||||
@@ -322,10 +325,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:10:23.216229Z",
|
||||
"iopub.status.busy": "2024-11-05T05:10:23.216076Z",
|
||||
"iopub.status.idle": "2024-11-05T05:10:23.884236Z",
|
||||
"shell.execute_reply": "2024-11-05T05:10:23.883897Z"
|
||||
"iopub.execute_input": "2024-11-07T18:47:34.257393Z",
|
||||
"iopub.status.busy": "2024-11-07T18:47:34.257246Z",
|
||||
"iopub.status.idle": "2024-11-07T18:47:34.413506Z",
|
||||
"shell.execute_reply": "2024-11-07T18:47:34.413172Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -365,10 +368,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:10:23.886276Z",
|
||||
"iopub.status.busy": "2024-11-05T05:10:23.886136Z",
|
||||
"iopub.status.idle": "2024-11-05T05:10:23.905880Z",
|
||||
"shell.execute_reply": "2024-11-05T05:10:23.905529Z"
|
||||
"iopub.execute_input": "2024-11-07T18:47:34.414816Z",
|
||||
"iopub.status.busy": "2024-11-07T18:47:34.414541Z",
|
||||
"iopub.status.idle": "2024-11-07T18:47:34.431341Z",
|
||||
"shell.execute_reply": "2024-11-07T18:47:34.431081Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -427,10 +430,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:10:23.907468Z",
|
||||
"iopub.status.busy": "2024-11-05T05:10:23.907247Z",
|
||||
"iopub.status.idle": "2024-11-05T05:10:26.920212Z",
|
||||
"shell.execute_reply": "2024-11-05T05:10:26.919865Z"
|
||||
"iopub.execute_input": "2024-11-07T18:47:34.432325Z",
|
||||
"iopub.status.busy": "2024-11-07T18:47:34.432208Z",
|
||||
"iopub.status.idle": "2024-11-07T18:47:37.444337Z",
|
||||
"shell.execute_reply": "2024-11-07T18:47:37.444000Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -482,10 +485,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:10:26.922675Z",
|
||||
"iopub.status.busy": "2024-11-05T05:10:26.922413Z",
|
||||
"iopub.status.idle": "2024-11-05T05:10:51.961703Z",
|
||||
"shell.execute_reply": "2024-11-05T05:10:51.960846Z"
|
||||
"iopub.execute_input": "2024-11-07T18:47:37.445894Z",
|
||||
"iopub.status.busy": "2024-11-07T18:47:37.445744Z",
|
||||
"iopub.status.idle": "2024-11-07T18:48:02.482532Z",
|
||||
"shell.execute_reply": "2024-11-07T18:48:02.482042Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -565,10 +568,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:10:51.964749Z",
|
||||
"iopub.status.busy": "2024-11-05T05:10:51.964215Z",
|
||||
"iopub.status.idle": "2024-11-05T05:11:05.023450Z",
|
||||
"shell.execute_reply": "2024-11-05T05:11:05.023101Z"
|
||||
"iopub.execute_input": "2024-11-07T18:48:02.485206Z",
|
||||
"iopub.status.busy": "2024-11-07T18:48:02.485064Z",
|
||||
"iopub.status.idle": "2024-11-07T18:48:15.521489Z",
|
||||
"shell.execute_reply": "2024-11-07T18:48:15.521156Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -660,10 +663,10 @@
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:11:05.024877Z",
|
||||
"iopub.status.busy": "2024-11-05T05:11:05.024561Z",
|
||||
"iopub.status.idle": "2024-11-05T05:11:06.358695Z",
|
||||
"shell.execute_reply": "2024-11-05T05:11:06.357635Z"
|
||||
"iopub.execute_input": "2024-11-07T18:48:15.522794Z",
|
||||
"iopub.status.busy": "2024-11-07T18:48:15.522657Z",
|
||||
"iopub.status.idle": "2024-11-07T18:48:16.875740Z",
|
||||
"shell.execute_reply": "2024-11-07T18:48:16.874847Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
|
||||
@@ -35,10 +35,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:22:17.227174Z",
|
||||
"iopub.status.busy": "2024-11-05T05:22:17.226952Z",
|
||||
"iopub.status.idle": "2024-11-05T05:22:42.445791Z",
|
||||
"shell.execute_reply": "2024-11-05T05:22:42.444980Z"
|
||||
"iopub.execute_input": "2024-11-07T18:48:21.128020Z",
|
||||
"iopub.status.busy": "2024-11-07T18:48:21.127898Z",
|
||||
"iopub.status.idle": "2024-11-07T18:48:45.310371Z",
|
||||
"shell.execute_reply": "2024-11-07T18:48:45.309469Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -72,10 +72,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:22:42.448147Z",
|
||||
"iopub.status.busy": "2024-11-05T05:22:42.447775Z",
|
||||
"iopub.status.idle": "2024-11-05T05:22:42.495311Z",
|
||||
"shell.execute_reply": "2024-11-05T05:22:42.495027Z"
|
||||
"iopub.execute_input": "2024-11-07T18:48:45.313506Z",
|
||||
"iopub.status.busy": "2024-11-07T18:48:45.313123Z",
|
||||
"iopub.status.idle": "2024-11-07T18:48:45.364918Z",
|
||||
"shell.execute_reply": "2024-11-07T18:48:45.364155Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -106,10 +106,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:22:42.496666Z",
|
||||
"iopub.status.busy": "2024-11-05T05:22:42.496524Z",
|
||||
"iopub.status.idle": "2024-11-05T05:22:42.540687Z",
|
||||
"shell.execute_reply": "2024-11-05T05:22:42.540060Z"
|
||||
"iopub.execute_input": "2024-11-07T18:48:45.367776Z",
|
||||
"iopub.status.busy": "2024-11-07T18:48:45.367490Z",
|
||||
"iopub.status.idle": "2024-11-07T18:48:45.411386Z",
|
||||
"shell.execute_reply": "2024-11-07T18:48:45.411134Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -140,10 +140,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:22:42.542551Z",
|
||||
"iopub.status.busy": "2024-11-05T05:22:42.542282Z",
|
||||
"iopub.status.idle": "2024-11-05T05:22:42.928542Z",
|
||||
"shell.execute_reply": "2024-11-05T05:22:42.928181Z"
|
||||
"iopub.execute_input": "2024-11-07T18:48:45.412462Z",
|
||||
"iopub.status.busy": "2024-11-07T18:48:45.412351Z",
|
||||
"iopub.status.idle": "2024-11-07T18:48:45.768796Z",
|
||||
"shell.execute_reply": "2024-11-07T18:48:45.768406Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -176,10 +176,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:22:42.930093Z",
|
||||
"iopub.status.busy": "2024-11-05T05:22:42.929954Z",
|
||||
"iopub.status.idle": "2024-11-05T05:22:44.799945Z",
|
||||
"shell.execute_reply": "2024-11-05T05:22:44.799562Z"
|
||||
"iopub.execute_input": "2024-11-07T18:48:45.770227Z",
|
||||
"iopub.status.busy": "2024-11-07T18:48:45.770106Z",
|
||||
"iopub.status.idle": "2024-11-07T18:48:47.447065Z",
|
||||
"shell.execute_reply": "2024-11-07T18:48:47.446733Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -208,10 +208,10 @@
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:22:44.801418Z",
|
||||
"iopub.status.busy": "2024-11-05T05:22:44.801192Z",
|
||||
"iopub.status.idle": "2024-11-05T05:22:45.094634Z",
|
||||
"shell.execute_reply": "2024-11-05T05:22:45.093950Z"
|
||||
"iopub.execute_input": "2024-11-07T18:48:47.448510Z",
|
||||
"iopub.status.busy": "2024-11-07T18:48:47.448337Z",
|
||||
"iopub.status.idle": "2024-11-07T18:48:47.743336Z",
|
||||
"shell.execute_reply": "2024-11-07T18:48:47.742276Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
|
||||
@@ -39,10 +39,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:22:49.320999Z",
|
||||
"iopub.status.busy": "2024-11-05T05:22:49.320880Z",
|
||||
"iopub.status.idle": "2024-11-05T05:23:21.537478Z",
|
||||
"shell.execute_reply": "2024-11-05T05:23:21.536956Z"
|
||||
"iopub.execute_input": "2024-11-07T18:43:47.311708Z",
|
||||
"iopub.status.busy": "2024-11-07T18:43:47.311517Z",
|
||||
"iopub.status.idle": "2024-11-07T18:44:18.512576Z",
|
||||
"shell.execute_reply": "2024-11-07T18:44:18.511909Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -78,10 +78,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:23:21.539953Z",
|
||||
"iopub.status.busy": "2024-11-05T05:23:21.539100Z",
|
||||
"iopub.status.idle": "2024-11-05T05:23:25.880179Z",
|
||||
"shell.execute_reply": "2024-11-05T05:23:25.879744Z"
|
||||
"iopub.execute_input": "2024-11-07T18:44:18.515678Z",
|
||||
"iopub.status.busy": "2024-11-07T18:44:18.515314Z",
|
||||
"iopub.status.idle": "2024-11-07T18:44:22.880793Z",
|
||||
"shell.execute_reply": "2024-11-07T18:44:22.880303Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -129,10 +129,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:23:25.881742Z",
|
||||
"iopub.status.busy": "2024-11-05T05:23:25.881595Z",
|
||||
"iopub.status.idle": "2024-11-05T05:23:26.758503Z",
|
||||
"shell.execute_reply": "2024-11-05T05:23:26.758084Z"
|
||||
"iopub.execute_input": "2024-11-07T18:44:22.883309Z",
|
||||
"iopub.status.busy": "2024-11-07T18:44:22.883160Z",
|
||||
"iopub.status.idle": "2024-11-07T18:44:27.048810Z",
|
||||
"shell.execute_reply": "2024-11-07T18:44:27.048074Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -176,10 +176,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:23:26.760098Z",
|
||||
"iopub.status.busy": "2024-11-05T05:23:26.759955Z",
|
||||
"iopub.status.idle": "2024-11-05T05:23:27.849510Z",
|
||||
"shell.execute_reply": "2024-11-05T05:23:27.849117Z"
|
||||
"iopub.execute_input": "2024-11-07T18:44:27.051312Z",
|
||||
"iopub.status.busy": "2024-11-07T18:44:27.051190Z",
|
||||
"iopub.status.idle": "2024-11-07T18:44:32.358097Z",
|
||||
"shell.execute_reply": "2024-11-07T18:44:32.357628Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -227,10 +227,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:23:27.850994Z",
|
||||
"iopub.status.busy": "2024-11-05T05:23:27.850864Z",
|
||||
"iopub.status.idle": "2024-11-05T05:23:31.609137Z",
|
||||
"shell.execute_reply": "2024-11-05T05:23:31.608748Z"
|
||||
"iopub.execute_input": "2024-11-07T18:44:32.359532Z",
|
||||
"iopub.status.busy": "2024-11-07T18:44:32.359413Z",
|
||||
"iopub.status.idle": "2024-11-07T18:44:36.164664Z",
|
||||
"shell.execute_reply": "2024-11-07T18:44:36.164005Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -276,10 +276,10 @@
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:23:31.610683Z",
|
||||
"iopub.status.busy": "2024-11-05T05:23:31.610560Z",
|
||||
"iopub.status.idle": "2024-11-05T05:23:32.965146Z",
|
||||
"shell.execute_reply": "2024-11-05T05:23:32.963922Z"
|
||||
"iopub.execute_input": "2024-11-07T18:44:36.167123Z",
|
||||
"iopub.status.busy": "2024-11-07T18:44:36.166535Z",
|
||||
"iopub.status.idle": "2024-11-07T18:44:37.743761Z",
|
||||
"shell.execute_reply": "2024-11-07T18:44:37.742510Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
|
||||
@@ -31,7 +31,7 @@ extensions = [
|
||||
]
|
||||
|
||||
nbsphinx_allow_errors = True
|
||||
nbsphinx_execute = 'never'
|
||||
nbsphinx_execute = "never"
|
||||
|
||||
autosectionlabel_prefix_document = True
|
||||
nbsphinx_allow_directives = True
|
||||
@@ -49,7 +49,7 @@ myst_enable_extensions = [
|
||||
|
||||
myst_heading_anchors = 3
|
||||
|
||||
nbsphinx_kernel_name = 'python3'
|
||||
nbsphinx_kernel_name = "python3"
|
||||
nbsphinx_execute_arguments = [
|
||||
"--InlineBackend.figure_formats={'svg', 'pdf'}",
|
||||
"--InlineBackend.rc={'figure.dpi': 96}",
|
||||
@@ -130,8 +130,10 @@ html_context = {
|
||||
html_static_path = ["_static"]
|
||||
html_css_files = ["css/custom_log.css"]
|
||||
|
||||
|
||||
def setup(app):
|
||||
app.add_css_file('css/custom_log.css')
|
||||
app.add_css_file("css/custom_log.css")
|
||||
|
||||
|
||||
myst_enable_extensions = [
|
||||
"dollarmath",
|
||||
|
||||
@@ -33,10 +33,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:11:10.680191Z",
|
||||
"iopub.status.busy": "2024-11-05T05:11:10.679710Z",
|
||||
"iopub.status.idle": "2024-11-05T05:11:39.882385Z",
|
||||
"shell.execute_reply": "2024-11-05T05:11:39.881827Z"
|
||||
"iopub.execute_input": "2024-11-07T18:48:52.032229Z",
|
||||
"iopub.status.busy": "2024-11-07T18:48:52.032105Z",
|
||||
"iopub.status.idle": "2024-11-07T18:49:20.226042Z",
|
||||
"shell.execute_reply": "2024-11-07T18:49:20.225562Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -49,7 +49,7 @@
|
||||
")\n",
|
||||
"\n",
|
||||
"server_process = execute_shell_command(\n",
|
||||
"\"\"\"\n",
|
||||
" \"\"\"\n",
|
||||
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
|
||||
"--port 30000 --host 0.0.0.0\n",
|
||||
"\"\"\"\n",
|
||||
@@ -70,10 +70,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:11:39.883923Z",
|
||||
"iopub.status.busy": "2024-11-05T05:11:39.883721Z",
|
||||
"iopub.status.idle": "2024-11-05T05:11:40.124980Z",
|
||||
"shell.execute_reply": "2024-11-05T05:11:40.124557Z"
|
||||
"iopub.execute_input": "2024-11-07T18:49:20.228006Z",
|
||||
"iopub.status.busy": "2024-11-07T18:49:20.227572Z",
|
||||
"iopub.status.idle": "2024-11-07T18:49:20.469885Z",
|
||||
"shell.execute_reply": "2024-11-07T18:49:20.469518Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -101,10 +101,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:11:40.126564Z",
|
||||
"iopub.status.busy": "2024-11-05T05:11:40.126369Z",
|
||||
"iopub.status.idle": "2024-11-05T05:11:40.324316Z",
|
||||
"shell.execute_reply": "2024-11-05T05:11:40.323693Z"
|
||||
"iopub.execute_input": "2024-11-07T18:49:20.471956Z",
|
||||
"iopub.status.busy": "2024-11-07T18:49:20.471811Z",
|
||||
"iopub.status.idle": "2024-11-07T18:49:20.667997Z",
|
||||
"shell.execute_reply": "2024-11-07T18:49:20.667630Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -115,9 +115,7 @@
|
||||
"\n",
|
||||
"data = {\n",
|
||||
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
|
||||
" \"messages\": [\n",
|
||||
" {\"role\": \"user\", \"content\": \"What is the capital of France?\"}\n",
|
||||
" ]\n",
|
||||
" \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, json=data)\n",
|
||||
@@ -136,10 +134,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:11:40.327043Z",
|
||||
"iopub.status.busy": "2024-11-05T05:11:40.326759Z",
|
||||
"iopub.status.idle": "2024-11-05T05:11:41.687336Z",
|
||||
"shell.execute_reply": "2024-11-05T05:11:41.686855Z"
|
||||
"iopub.execute_input": "2024-11-07T18:49:20.669977Z",
|
||||
"iopub.status.busy": "2024-11-07T18:49:20.669826Z",
|
||||
"iopub.status.idle": "2024-11-07T18:49:22.004855Z",
|
||||
"shell.execute_reply": "2024-11-07T18:49:22.004472Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -171,10 +169,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:11:41.688676Z",
|
||||
"iopub.status.busy": "2024-11-05T05:11:41.688527Z",
|
||||
"iopub.status.idle": "2024-11-05T05:11:42.717140Z",
|
||||
"shell.execute_reply": "2024-11-05T05:11:42.716452Z"
|
||||
"iopub.execute_input": "2024-11-07T18:49:22.006983Z",
|
||||
"iopub.status.busy": "2024-11-07T18:49:22.006858Z",
|
||||
"iopub.status.idle": "2024-11-07T18:49:23.029098Z",
|
||||
"shell.execute_reply": "2024-11-07T18:49:23.028697Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -197,7 +195,7 @@
|
||||
"# Handle the streaming output\n",
|
||||
"for chunk in response:\n",
|
||||
" if chunk.choices[0].delta.content:\n",
|
||||
" print(chunk.choices[0].delta.content, end='', flush=True)"
|
||||
" print(chunk.choices[0].delta.content, end=\"\", flush=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -214,10 +212,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:11:42.720467Z",
|
||||
"iopub.status.busy": "2024-11-05T05:11:42.720182Z",
|
||||
"iopub.status.idle": "2024-11-05T05:11:43.480765Z",
|
||||
"shell.execute_reply": "2024-11-05T05:11:43.480143Z"
|
||||
"iopub.execute_input": "2024-11-07T18:49:23.031712Z",
|
||||
"iopub.status.busy": "2024-11-07T18:49:23.031571Z",
|
||||
"iopub.status.idle": "2024-11-07T18:49:23.787752Z",
|
||||
"shell.execute_reply": "2024-11-07T18:49:23.787368Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -250,10 +248,10 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:11:43.483575Z",
|
||||
"iopub.status.busy": "2024-11-05T05:11:43.483295Z",
|
||||
"iopub.status.idle": "2024-11-05T05:11:44.242950Z",
|
||||
"shell.execute_reply": "2024-11-05T05:11:44.242248Z"
|
||||
"iopub.execute_input": "2024-11-07T18:49:23.789840Z",
|
||||
"iopub.status.busy": "2024-11-07T18:49:23.789702Z",
|
||||
"iopub.status.idle": "2024-11-07T18:49:24.545631Z",
|
||||
"shell.execute_reply": "2024-11-07T18:49:24.545241Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -290,10 +288,10 @@
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2024-11-05T05:11:44.245660Z",
|
||||
"iopub.status.busy": "2024-11-05T05:11:44.245373Z",
|
||||
"iopub.status.idle": "2024-11-05T05:11:45.591682Z",
|
||||
"shell.execute_reply": "2024-11-05T05:11:45.591184Z"
|
||||
"iopub.execute_input": "2024-11-07T18:49:24.547641Z",
|
||||
"iopub.status.busy": "2024-11-07T18:49:24.547497Z",
|
||||
"iopub.status.idle": "2024-11-07T18:49:25.888864Z",
|
||||
"shell.execute_reply": "2024-11-07T18:49:25.888114Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
|
||||
@@ -71,7 +71,7 @@
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"from typing import List\n",
|
||||
"from typing import List\n",
|
||||
"\n",
|
||||
"import chromadb\n",
|
||||
"\n",
|
||||
@@ -80,7 +80,7 @@
|
||||
"if not os.path.exists(path_qca):\n",
|
||||
" !wget https://virattt.github.io/datasets/abnb-2023-10k.json -O airbnb-2023-10k-qca.json\n",
|
||||
"\n",
|
||||
"with open(path_qca, 'r') as f:\n",
|
||||
"with open(path_qca, \"r\") as f:\n",
|
||||
" question_context_answers = json.load(f)\n",
|
||||
"\n",
|
||||
"chroma_client = chromadb.PersistentClient()\n",
|
||||
@@ -88,7 +88,7 @@
|
||||
"if collection.count() == 0:\n",
|
||||
" collection.add(\n",
|
||||
" documents=[qca[\"context\"] for qca in question_context_answers],\n",
|
||||
" ids=[str(i) for i in range(len(question_context_answers))]\n",
|
||||
" ids=[str(i) for i in range(len(question_context_answers))],\n",
|
||||
" )"
|
||||
],
|
||||
"metadata": {
|
||||
@@ -123,7 +123,7 @@
|
||||
"\n",
|
||||
"load_dotenv()\n",
|
||||
"\n",
|
||||
"os.environ['TOKENIZERS_PARALLELISM'] = \"false\"\n",
|
||||
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
|
||||
"\n",
|
||||
"p = Parea(api_key=os.getenv(\"PAREA_API_KEY\"), project_name=\"rag_sglang\")\n",
|
||||
"p.integrate_with_sglang()\n",
|
||||
@@ -150,10 +150,7 @@
|
||||
"source": [
|
||||
"@trace\n",
|
||||
"def retrieval(question: str) -> List[str]:\n",
|
||||
" return collection.query(\n",
|
||||
" query_texts=[question],\n",
|
||||
" n_results=1\n",
|
||||
" )['documents'][0]"
|
||||
" return collection.query(query_texts=[question], n_results=1)[\"documents\"][0]"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
@@ -176,7 +173,9 @@
|
||||
"@function\n",
|
||||
"def generation_sglang(s, question: str, *context: str):\n",
|
||||
" context = \"\\n\".join(context)\n",
|
||||
" s += user(f'Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.')\n",
|
||||
" s += user(\n",
|
||||
" f\"Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.\"\n",
|
||||
" )\n",
|
||||
" s += assistant(gen(\"answer\"))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -223,7 +222,9 @@
|
||||
" return generation(question, *contexts)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"rag_pipeline(\"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\")"
|
||||
"rag_pipeline(\n",
|
||||
" \"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -271,7 +272,10 @@
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from parea.evals.rag import context_query_relevancy_factory, percent_target_supported_by_context_factory\n",
|
||||
"from parea.evals.rag import (\n",
|
||||
" context_query_relevancy_factory,\n",
|
||||
" percent_target_supported_by_context_factory,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"context_relevancy_eval = context_query_relevancy_factory()\n",
|
||||
@@ -280,10 +284,7 @@
|
||||
"\n",
|
||||
"@trace(eval_funcs=[context_relevancy_eval, percent_target_supported_by_context])\n",
|
||||
"def retrieval(question: str) -> List[str]:\n",
|
||||
" return collection.query(\n",
|
||||
" query_texts=[question],\n",
|
||||
" n_results=1\n",
|
||||
" )['documents'][0]"
|
||||
" return collection.query(query_texts=[question], n_results=1)[\"documents\"][0]"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
@@ -310,10 +311,13 @@
|
||||
"answer_context_faithfulness = answer_context_faithfulness_statement_level_factory()\n",
|
||||
"answer_matches_target_llm_grader = answer_matches_target_llm_grader_factory()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@function\n",
|
||||
"def generation_sglang(s, question: str, *context: str):\n",
|
||||
" context = \"\\n\".join(context)\n",
|
||||
" s += user(f'Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.')\n",
|
||||
" s += user(\n",
|
||||
" f\"Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.\"\n",
|
||||
" )\n",
|
||||
" s += assistant(gen(\"answer\", max_tokens=1_000))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -357,7 +361,9 @@
|
||||
" return generation(question, *contexts)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"rag_pipeline(\"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\")"
|
||||
"rag_pipeline(\n",
|
||||
" \"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\"\n",
|
||||
")"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
@@ -402,6 +408,7 @@
|
||||
"source": [
|
||||
"!pip install nest-asyncio\n",
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()"
|
||||
],
|
||||
"metadata": {
|
||||
@@ -461,7 +468,7 @@
|
||||
],
|
||||
"source": [
|
||||
"e = p.experiment(\n",
|
||||
" 'RAG',\n",
|
||||
" \"RAG\",\n",
|
||||
" data=[\n",
|
||||
" {\n",
|
||||
" \"question\": qca[\"question\"],\n",
|
||||
@@ -469,7 +476,7 @@
|
||||
" }\n",
|
||||
" for qca in question_context_answers\n",
|
||||
" ],\n",
|
||||
" func=rag_pipeline\n",
|
||||
" func=rag_pipeline,\n",
|
||||
").run()"
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@@ -7,6 +7,7 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
|
||||
|
||||
def main():
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
|
||||
@@ -39,7 +39,7 @@ class ModelConfig:
|
||||
revision: Optional[str] = None,
|
||||
context_length: Optional[int] = None,
|
||||
model_override_args: Optional[dict] = None,
|
||||
is_embedding: Optional[bool] = None
|
||||
is_embedding: Optional[bool] = None,
|
||||
) -> None:
|
||||
# Parse args
|
||||
self.model_override_args = json.loads(model_override_args)
|
||||
@@ -52,7 +52,9 @@ class ModelConfig:
|
||||
self.hf_text_config = get_hf_text_config(self.hf_config)
|
||||
|
||||
# Check model type
|
||||
self.is_generation = is_generation_model(self.hf_config.architectures, is_embedding)
|
||||
self.is_generation = is_generation_model(
|
||||
self.hf_config.architectures, is_embedding
|
||||
)
|
||||
self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
|
||||
self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
|
||||
|
||||
|
||||
@@ -122,16 +122,14 @@ class QuantizationConfig(ABC):
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def method_has_implemented_embedding(
|
||||
method_class: Type[QuantizeMethodBase]) -> bool:
|
||||
|
||||
def method_has_implemented_embedding(method_class: Type[QuantizeMethodBase]) -> bool:
|
||||
"""
|
||||
Not all quant methods have embedding implemented, so we need to check that
|
||||
it exists for our given method. We check this by making sure the function
|
||||
has been changed from the base implementation.
|
||||
"""
|
||||
base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding",
|
||||
None)
|
||||
base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", None)
|
||||
class_embedding = inspect.getattr_static(method_class, "embedding", None)
|
||||
|
||||
return (class_embedding is not None
|
||||
and class_embedding is not base_embedding)
|
||||
return class_embedding is not None and class_embedding is not base_embedding
|
||||
|
||||
@@ -27,59 +27,67 @@ DEFAULT_VOCAB_PADDING_SIZE = 64
|
||||
class UnquantizedEmbeddingMethod(QuantizeMethodBase):
|
||||
"""Unquantized method for embeddings."""
|
||||
|
||||
def create_weights(self, layer: torch.nn.Module,
|
||||
input_size_per_partition: int,
|
||||
output_partition_sizes: List[int], input_size: int,
|
||||
output_size: int, params_dtype: torch.dtype,
|
||||
**extra_weight_attrs):
|
||||
def create_weights(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
input_size_per_partition: int,
|
||||
output_partition_sizes: List[int],
|
||||
input_size: int,
|
||||
output_size: int,
|
||||
params_dtype: torch.dtype,
|
||||
**extra_weight_attrs,
|
||||
):
|
||||
"""Create weights for embedding layer."""
|
||||
weight = Parameter(torch.empty(sum(output_partition_sizes),
|
||||
input_size_per_partition,
|
||||
dtype=params_dtype),
|
||||
requires_grad=False)
|
||||
weight = Parameter(
|
||||
torch.empty(
|
||||
sum(output_partition_sizes),
|
||||
input_size_per_partition,
|
||||
dtype=params_dtype,
|
||||
),
|
||||
requires_grad=False,
|
||||
)
|
||||
set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
|
||||
layer.register_parameter("weight", weight)
|
||||
set_weight_attrs(weight, extra_weight_attrs)
|
||||
|
||||
def apply(self,
|
||||
layer: torch.nn.Module,
|
||||
x: torch.Tensor,
|
||||
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
x: torch.Tensor,
|
||||
bias: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
return F.linear(x, layer.weight, bias)
|
||||
|
||||
def embedding(self, layer: torch.nn.Module,
|
||||
input_: torch.Tensor) -> torch.Tensor:
|
||||
def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor:
|
||||
return F.embedding(input_, layer.weight)
|
||||
|
||||
|
||||
def pad_vocab_size(vocab_size: int,
|
||||
pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
|
||||
def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
|
||||
"""Pad the vocab size to the given value."""
|
||||
return ((vocab_size + pad_to - 1) // pad_to) * pad_to
|
||||
|
||||
|
||||
def vocab_range_from_per_partition_vocab_size(
|
||||
per_partition_vocab_size: int,
|
||||
rank: int,
|
||||
offset: int = 0) -> Sequence[int]:
|
||||
per_partition_vocab_size: int, rank: int, offset: int = 0
|
||||
) -> Sequence[int]:
|
||||
index_f = rank * per_partition_vocab_size
|
||||
index_l = index_f + per_partition_vocab_size
|
||||
return index_f + offset, index_l + offset
|
||||
|
||||
|
||||
def vocab_range_from_global_vocab_size(global_vocab_size: int,
|
||||
rank: int,
|
||||
world_size: int,
|
||||
offset: int = 0) -> Sequence[int]:
|
||||
def vocab_range_from_global_vocab_size(
|
||||
global_vocab_size: int, rank: int, world_size: int, offset: int = 0
|
||||
) -> Sequence[int]:
|
||||
per_partition_vocab_size = divide(global_vocab_size, world_size)
|
||||
return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
|
||||
rank,
|
||||
offset=offset)
|
||||
return vocab_range_from_per_partition_vocab_size(
|
||||
per_partition_vocab_size, rank, offset=offset
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class VocabParallelEmbeddingShardIndices:
|
||||
"""Indices for a shard of a vocab parallel embedding."""
|
||||
|
||||
padded_org_vocab_start_index: int
|
||||
padded_org_vocab_end_index: int
|
||||
padded_added_vocab_start_index: int
|
||||
@@ -100,13 +108,11 @@ class VocabParallelEmbeddingShardIndices:
|
||||
|
||||
@property
|
||||
def num_org_elements_padded(self) -> int:
|
||||
return (self.padded_org_vocab_end_index -
|
||||
self.padded_org_vocab_start_index)
|
||||
return self.padded_org_vocab_end_index - self.padded_org_vocab_start_index
|
||||
|
||||
@property
|
||||
def num_added_elements_padded(self) -> int:
|
||||
return (self.padded_added_vocab_end_index -
|
||||
self.padded_added_vocab_start_index)
|
||||
return self.padded_added_vocab_end_index - self.padded_added_vocab_start_index
|
||||
|
||||
@property
|
||||
def num_org_vocab_padding(self) -> int:
|
||||
@@ -122,17 +128,14 @@ class VocabParallelEmbeddingShardIndices:
|
||||
|
||||
def __post_init__(self):
|
||||
# sanity checks
|
||||
assert (self.padded_org_vocab_start_index <=
|
||||
self.padded_org_vocab_end_index)
|
||||
assert (self.padded_added_vocab_start_index <=
|
||||
self.padded_added_vocab_end_index)
|
||||
assert self.padded_org_vocab_start_index <= self.padded_org_vocab_end_index
|
||||
assert self.padded_added_vocab_start_index <= self.padded_added_vocab_end_index
|
||||
|
||||
assert self.org_vocab_start_index <= self.org_vocab_end_index
|
||||
assert self.added_vocab_start_index <= self.added_vocab_end_index
|
||||
|
||||
assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
|
||||
assert (self.added_vocab_start_index <=
|
||||
self.padded_added_vocab_start_index)
|
||||
assert self.added_vocab_start_index <= self.padded_added_vocab_start_index
|
||||
assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
|
||||
assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
|
||||
|
||||
@@ -142,20 +145,27 @@ class VocabParallelEmbeddingShardIndices:
|
||||
|
||||
@torch.jit.script
|
||||
def get_masked_input_and_mask(
|
||||
input_: torch.Tensor, org_vocab_start_index: int,
|
||||
org_vocab_end_index: int, num_org_vocab_padding: int,
|
||||
added_vocab_start_index: int,
|
||||
added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
input_: torch.Tensor,
|
||||
org_vocab_start_index: int,
|
||||
org_vocab_end_index: int,
|
||||
num_org_vocab_padding: int,
|
||||
added_vocab_start_index: int,
|
||||
added_vocab_end_index: int,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
# torch.jit.script will fuse all of the pointwise ops below
|
||||
# into a single kernel, making it very fast
|
||||
org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
|
||||
org_vocab_end_index)
|
||||
org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index)
|
||||
added_vocab_mask = (input_ >= added_vocab_start_index) & (
|
||||
input_ < added_vocab_end_index)
|
||||
added_offset = added_vocab_start_index - (
|
||||
org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding
|
||||
valid_offset = (org_vocab_start_index *
|
||||
org_vocab_mask) + (added_offset * added_vocab_mask)
|
||||
input_ < added_vocab_end_index
|
||||
)
|
||||
added_offset = (
|
||||
added_vocab_start_index
|
||||
- (org_vocab_end_index - org_vocab_start_index)
|
||||
- num_org_vocab_padding
|
||||
)
|
||||
valid_offset = (org_vocab_start_index * org_vocab_mask) + (
|
||||
added_offset * added_vocab_mask
|
||||
)
|
||||
vocab_mask = org_vocab_mask | added_vocab_mask
|
||||
input_ = vocab_mask * (input_ - valid_offset)
|
||||
return input_, ~vocab_mask
|
||||
@@ -200,15 +210,17 @@ class VocabParallelEmbedding(torch.nn.Module):
|
||||
prefix: full name of the layer in the state dict
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
embedding_dim: int,
|
||||
params_dtype: Optional[torch.dtype] = None,
|
||||
org_num_embeddings: Optional[int] = None,
|
||||
padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
enable_tp: bool = True):
|
||||
def __init__(
|
||||
self,
|
||||
num_embeddings: int,
|
||||
embedding_dim: int,
|
||||
params_dtype: Optional[torch.dtype] = None,
|
||||
org_num_embeddings: Optional[int] = None,
|
||||
padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
enable_tp: bool = True,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.enable_tp = enable_tp
|
||||
@@ -223,18 +235,22 @@ class VocabParallelEmbedding(torch.nn.Module):
|
||||
self.padding_size = padding_size
|
||||
self.org_vocab_size = org_num_embeddings or num_embeddings
|
||||
num_added_embeddings = num_embeddings - self.org_vocab_size
|
||||
self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size,
|
||||
self.padding_size)
|
||||
self.org_vocab_size_padded = pad_vocab_size(
|
||||
self.org_vocab_size, self.padding_size
|
||||
)
|
||||
self.num_embeddings_padded = pad_vocab_size(
|
||||
self.org_vocab_size_padded + num_added_embeddings,
|
||||
self.padding_size)
|
||||
self.org_vocab_size_padded + num_added_embeddings, self.padding_size
|
||||
)
|
||||
assert self.org_vocab_size_padded <= self.num_embeddings_padded
|
||||
|
||||
self.shard_indices = self._get_indices(self.num_embeddings_padded,
|
||||
self.org_vocab_size_padded,
|
||||
self.num_embeddings,
|
||||
self.org_vocab_size, tp_rank,
|
||||
self.tp_size)
|
||||
self.shard_indices = self._get_indices(
|
||||
self.num_embeddings_padded,
|
||||
self.org_vocab_size_padded,
|
||||
self.num_embeddings,
|
||||
self.org_vocab_size,
|
||||
tp_rank,
|
||||
self.tp_size,
|
||||
)
|
||||
self.embedding_dim = embedding_dim
|
||||
|
||||
linear_method = None
|
||||
@@ -248,11 +264,13 @@ class VocabParallelEmbedding(torch.nn.Module):
|
||||
# layer type like ParallelLMHead, this is not important.
|
||||
is_embedding_layer = type(self.__class__) is VocabParallelEmbedding
|
||||
linear_method_implements_embedding = method_has_implemented_embedding(
|
||||
type(linear_method))
|
||||
type(linear_method)
|
||||
)
|
||||
if is_embedding_layer and not linear_method_implements_embedding:
|
||||
raise NotImplementedError(
|
||||
f"The class {type(linear_method).__name__} must implement "
|
||||
"the 'embedding' method, see UnquantizedEmbeddingMethod.")
|
||||
"the 'embedding' method, see UnquantizedEmbeddingMethod."
|
||||
)
|
||||
|
||||
self.linear_method: QuantizeMethodBase = linear_method
|
||||
|
||||
@@ -260,53 +278,68 @@ class VocabParallelEmbedding(torch.nn.Module):
|
||||
params_dtype = torch.get_default_dtype()
|
||||
# Divide the weight matrix along the vocaburaly dimension.
|
||||
self.num_added_embeddings = self.num_embeddings - self.org_vocab_size
|
||||
self.num_embeddings_per_partition = divide(self.num_embeddings_padded,
|
||||
self.tp_size)
|
||||
assert (self.shard_indices.num_elements_padded ==
|
||||
self.num_embeddings_per_partition)
|
||||
self.num_embeddings_per_partition = divide(
|
||||
self.num_embeddings_padded, self.tp_size
|
||||
)
|
||||
assert (
|
||||
self.shard_indices.num_elements_padded == self.num_embeddings_per_partition
|
||||
)
|
||||
self.num_org_embeddings_per_partition = (
|
||||
self.shard_indices.org_vocab_end_index -
|
||||
self.shard_indices.org_vocab_start_index)
|
||||
self.shard_indices.org_vocab_end_index
|
||||
- self.shard_indices.org_vocab_start_index
|
||||
)
|
||||
self.num_added_embeddings_per_partition = (
|
||||
self.shard_indices.added_vocab_end_index -
|
||||
self.shard_indices.added_vocab_start_index)
|
||||
self.shard_indices.added_vocab_end_index
|
||||
- self.shard_indices.added_vocab_start_index
|
||||
)
|
||||
|
||||
self.linear_method.create_weights(self,
|
||||
self.embedding_dim,
|
||||
[self.num_embeddings_per_partition],
|
||||
self.embedding_dim,
|
||||
self.num_embeddings_padded,
|
||||
params_dtype=params_dtype,
|
||||
weight_loader=self.weight_loader)
|
||||
self.linear_method.create_weights(
|
||||
self,
|
||||
self.embedding_dim,
|
||||
[self.num_embeddings_per_partition],
|
||||
self.embedding_dim,
|
||||
self.num_embeddings_padded,
|
||||
params_dtype=params_dtype,
|
||||
weight_loader=self.weight_loader,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int,
|
||||
vocab_size: int, org_vocab_size: int, tp_rank: int,
|
||||
tp_size: int) -> VocabParallelEmbeddingShardIndices:
|
||||
def _get_indices(
|
||||
cls,
|
||||
vocab_size_padded: int,
|
||||
org_vocab_size_padded: int,
|
||||
vocab_size: int,
|
||||
org_vocab_size: int,
|
||||
tp_rank: int,
|
||||
tp_size: int,
|
||||
) -> VocabParallelEmbeddingShardIndices:
|
||||
"""Get start and end indices for vocab parallel embedding, following the
|
||||
layout outlined in the class docstring, based on the given tp_rank and
|
||||
tp_size."""
|
||||
num_added_embeddings_padded = vocab_size_padded - org_vocab_size_padded
|
||||
padded_org_vocab_start_index, padded_org_vocab_end_index = (
|
||||
vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank,
|
||||
tp_size))
|
||||
vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size)
|
||||
)
|
||||
padded_added_vocab_start_index, padded_added_vocab_end_index = (
|
||||
vocab_range_from_global_vocab_size(num_added_embeddings_padded,
|
||||
tp_rank,
|
||||
tp_size,
|
||||
offset=org_vocab_size))
|
||||
vocab_range_from_global_vocab_size(
|
||||
num_added_embeddings_padded, tp_rank, tp_size, offset=org_vocab_size
|
||||
)
|
||||
)
|
||||
# remove padding
|
||||
org_vocab_start_index = min(padded_org_vocab_start_index,
|
||||
org_vocab_size)
|
||||
org_vocab_start_index = min(padded_org_vocab_start_index, org_vocab_size)
|
||||
org_vocab_end_index = min(padded_org_vocab_end_index, org_vocab_size)
|
||||
added_vocab_start_index = min(padded_added_vocab_start_index,
|
||||
vocab_size)
|
||||
added_vocab_start_index = min(padded_added_vocab_start_index, vocab_size)
|
||||
added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size)
|
||||
return VocabParallelEmbeddingShardIndices(
|
||||
padded_org_vocab_start_index, padded_org_vocab_end_index,
|
||||
padded_added_vocab_start_index, padded_added_vocab_end_index,
|
||||
org_vocab_start_index, org_vocab_end_index,
|
||||
added_vocab_start_index, added_vocab_end_index)
|
||||
padded_org_vocab_start_index,
|
||||
padded_org_vocab_end_index,
|
||||
padded_added_vocab_start_index,
|
||||
padded_added_vocab_end_index,
|
||||
org_vocab_start_index,
|
||||
org_vocab_end_index,
|
||||
added_vocab_start_index,
|
||||
added_vocab_end_index,
|
||||
)
|
||||
|
||||
def get_sharded_to_full_mapping(self) -> Optional[List[int]]:
|
||||
"""Get a mapping that can be used to reindex the gathered
|
||||
@@ -326,32 +359,49 @@ class VocabParallelEmbedding(torch.nn.Module):
|
||||
added_embeddings: List[int] = []
|
||||
padding: List[int] = []
|
||||
for tp_rank in range(self.tp_size):
|
||||
shard_indices = self._get_indices(self.num_embeddings_padded,
|
||||
self.org_vocab_size_padded,
|
||||
self.num_embeddings,
|
||||
self.org_vocab_size, tp_rank,
|
||||
self.tp_size)
|
||||
shard_indices = self._get_indices(
|
||||
self.num_embeddings_padded,
|
||||
self.org_vocab_size_padded,
|
||||
self.num_embeddings,
|
||||
self.org_vocab_size,
|
||||
tp_rank,
|
||||
self.tp_size,
|
||||
)
|
||||
range_start = self.num_embeddings_per_partition * tp_rank
|
||||
range_end = self.num_embeddings_per_partition * (tp_rank + 1)
|
||||
base_embeddings.extend(
|
||||
range(range_start,
|
||||
range_start + shard_indices.num_org_elements))
|
||||
range(range_start, range_start + shard_indices.num_org_elements)
|
||||
)
|
||||
padding.extend(
|
||||
range(range_start + shard_indices.num_org_elements,
|
||||
range_start + shard_indices.num_org_elements_padded))
|
||||
range(
|
||||
range_start + shard_indices.num_org_elements,
|
||||
range_start + shard_indices.num_org_elements_padded,
|
||||
)
|
||||
)
|
||||
added_embeddings.extend(
|
||||
range(
|
||||
range_start + shard_indices.num_org_elements_padded,
|
||||
range_start + shard_indices.num_org_elements_padded +
|
||||
shard_indices.num_added_elements))
|
||||
range_start
|
||||
+ shard_indices.num_org_elements_padded
|
||||
+ shard_indices.num_added_elements,
|
||||
)
|
||||
)
|
||||
padding.extend(
|
||||
range(
|
||||
range_start + shard_indices.num_org_elements_padded +
|
||||
shard_indices.num_added_elements,
|
||||
range_start + shard_indices.num_org_elements_padded +
|
||||
shard_indices.num_added_elements_padded))
|
||||
assert (range_start + shard_indices.num_org_elements_padded +
|
||||
shard_indices.num_added_elements_padded == range_end)
|
||||
range_start
|
||||
+ shard_indices.num_org_elements_padded
|
||||
+ shard_indices.num_added_elements,
|
||||
range_start
|
||||
+ shard_indices.num_org_elements_padded
|
||||
+ shard_indices.num_added_elements_padded,
|
||||
)
|
||||
)
|
||||
assert (
|
||||
range_start
|
||||
+ shard_indices.num_org_elements_padded
|
||||
+ shard_indices.num_added_elements_padded
|
||||
== range_end
|
||||
)
|
||||
ret = base_embeddings + added_embeddings + padding
|
||||
assert len(ret) == self.num_embeddings_padded
|
||||
return ret
|
||||
@@ -385,10 +435,14 @@ class VocabParallelEmbedding(torch.nn.Module):
|
||||
# If param packed on the same dim we are sharding on, then
|
||||
# need to adjust offsets of loaded weight by pack_factor.
|
||||
if packed_dim is not None and packed_dim == output_dim:
|
||||
packed_factor = param.packed_factor if isinstance(
|
||||
param, BasevLLMParameter) else param.pack_factor
|
||||
assert loaded_weight.shape[output_dim] == (self.org_vocab_size //
|
||||
param.packed_factor)
|
||||
packed_factor = (
|
||||
param.packed_factor
|
||||
if isinstance(param, BasevLLMParameter)
|
||||
else param.pack_factor
|
||||
)
|
||||
assert loaded_weight.shape[output_dim] == (
|
||||
self.org_vocab_size // param.packed_factor
|
||||
)
|
||||
start_idx = start_idx // packed_factor
|
||||
shard_size = shard_size // packed_factor
|
||||
else:
|
||||
@@ -396,23 +450,24 @@ class VocabParallelEmbedding(torch.nn.Module):
|
||||
|
||||
# Copy the data.
|
||||
loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
|
||||
param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
|
||||
param[loaded_weight.shape[0]:].data.fill_(0)
|
||||
param[: loaded_weight.shape[0]].data.copy_(loaded_weight)
|
||||
param[loaded_weight.shape[0] :].data.fill_(0)
|
||||
|
||||
def forward(self, input_):
|
||||
if self.tp_size > 1:
|
||||
# Build the mask.
|
||||
masked_input, input_mask = get_masked_input_and_mask(
|
||||
input_, self.shard_indices.org_vocab_start_index,
|
||||
input_,
|
||||
self.shard_indices.org_vocab_start_index,
|
||||
self.shard_indices.org_vocab_end_index,
|
||||
self.shard_indices.num_org_vocab_padding,
|
||||
self.shard_indices.added_vocab_start_index,
|
||||
self.shard_indices.added_vocab_end_index)
|
||||
self.shard_indices.added_vocab_end_index,
|
||||
)
|
||||
else:
|
||||
masked_input = input_
|
||||
# Get the embeddings.
|
||||
output_parallel = self.linear_method.embedding(self,
|
||||
masked_input.long())
|
||||
output_parallel = self.linear_method.embedding(self, masked_input.long())
|
||||
# Mask the output embedding.
|
||||
if self.tp_size > 1:
|
||||
output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
|
||||
@@ -426,9 +481,9 @@ class VocabParallelEmbedding(torch.nn.Module):
|
||||
s = f"num_embeddings={self.num_embeddings_per_partition}"
|
||||
s += f", embedding_dim={self.embedding_dim}"
|
||||
s += f", org_vocab_size={self.org_vocab_size}"
|
||||
s += f', num_embeddings_padded={self.num_embeddings_padded}'
|
||||
s += f", num_embeddings_padded={self.num_embeddings_padded}"
|
||||
if self.enable_tp:
|
||||
s += f', tp_size={self.tp_size}'
|
||||
s += f", tp_size={self.tp_size}"
|
||||
return s
|
||||
|
||||
|
||||
@@ -448,27 +503,38 @@ class ParallelLMHead(VocabParallelEmbedding):
|
||||
padding_size: padding size for the vocabulary.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
embedding_dim: int,
|
||||
bias: bool = False,
|
||||
params_dtype: Optional[torch.dtype] = None,
|
||||
org_num_embeddings: Optional[int] = None,
|
||||
padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = ""):
|
||||
super().__init__(num_embeddings, embedding_dim, params_dtype,
|
||||
org_num_embeddings, padding_size, quant_config,
|
||||
prefix)
|
||||
def __init__(
|
||||
self,
|
||||
num_embeddings: int,
|
||||
embedding_dim: int,
|
||||
bias: bool = False,
|
||||
params_dtype: Optional[torch.dtype] = None,
|
||||
org_num_embeddings: Optional[int] = None,
|
||||
padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__(
|
||||
num_embeddings,
|
||||
embedding_dim,
|
||||
params_dtype,
|
||||
org_num_embeddings,
|
||||
padding_size,
|
||||
quant_config,
|
||||
prefix,
|
||||
)
|
||||
self.quant_config = quant_config
|
||||
if bias:
|
||||
self.bias = Parameter(
|
||||
torch.empty(self.num_embeddings_per_partition,
|
||||
dtype=params_dtype))
|
||||
set_weight_attrs(self.bias, {
|
||||
"output_dim": 0,
|
||||
"weight_loader": self.weight_loader,
|
||||
})
|
||||
torch.empty(self.num_embeddings_per_partition, dtype=params_dtype)
|
||||
)
|
||||
set_weight_attrs(
|
||||
self.bias,
|
||||
{
|
||||
"output_dim": 0,
|
||||
"weight_loader": self.weight_loader,
|
||||
},
|
||||
)
|
||||
else:
|
||||
self.register_parameter("bias", None)
|
||||
|
||||
|
||||
@@ -86,8 +86,10 @@ class GenerateReqInput:
|
||||
self.parallel_sample_num = self.sampling_params.get("n", 1)
|
||||
else: # isinstance(self.sampling_params, list):
|
||||
self.parallel_sample_num = self.sampling_params[0].get("n", 1)
|
||||
assert all(self.parallel_sample_num == sampling_params.get("n", 1) for sampling_params in self.sampling_params), (
|
||||
"The parallel_sample_num should be the same for all samples in sample params.")
|
||||
assert all(
|
||||
self.parallel_sample_num == sampling_params.get("n", 1)
|
||||
for sampling_params in self.sampling_params
|
||||
), "The parallel_sample_num should be the same for all samples in sample params."
|
||||
|
||||
if self.parallel_sample_num > 1 and self.is_single:
|
||||
self.is_single = False
|
||||
|
||||
@@ -911,8 +911,7 @@ class ScheduleBatch:
|
||||
keep_indices = [
|
||||
i
|
||||
for i in range(len(self.reqs))
|
||||
if not self.reqs[i].finished()
|
||||
and self.reqs[i] is not being_chunked_req
|
||||
if not self.reqs[i].finished() and self.reqs[i] is not being_chunked_req
|
||||
]
|
||||
|
||||
if keep_indices is None or len(keep_indices) == 0:
|
||||
@@ -1043,6 +1042,7 @@ class ScheduleBatch:
|
||||
for req in self.reqs:
|
||||
req.started_time = time.time()
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class ModelWorkerBatch:
|
||||
# The batch id
|
||||
|
||||
@@ -224,8 +224,8 @@ class Scheduler:
|
||||
self.forward_ct = 0
|
||||
self.forward_ct_decode = 0
|
||||
self.num_generated_tokens = 0
|
||||
self.last_stats_tic = time.time() # time of last stats for every iter
|
||||
self.last_log_tic = time.time() # time of last log for print decode log
|
||||
self.last_stats_tic = time.time() # time of last stats for every iter
|
||||
self.last_log_tic = time.time() # time of last log for print decode log
|
||||
self.stream_interval = server_args.stream_interval
|
||||
|
||||
# Init chunked prefill
|
||||
@@ -566,9 +566,7 @@ class Scheduler:
|
||||
and not self.last_batch.is_empty()
|
||||
):
|
||||
if self.being_chunked_req:
|
||||
self.last_batch.filter_batch(
|
||||
being_chunked_req=self.being_chunked_req
|
||||
)
|
||||
self.last_batch.filter_batch(being_chunked_req=self.being_chunked_req)
|
||||
self.tree_cache.cache_unfinished_req(self.being_chunked_req)
|
||||
# Inflight request keeps its rid but will get a new req_pool_idx.
|
||||
self.req_to_token_pool.free(self.being_chunked_req.req_pool_idx)
|
||||
@@ -628,9 +626,7 @@ class Scheduler:
|
||||
has_inflight = self.being_chunked_req is not None
|
||||
if has_inflight:
|
||||
self.being_chunked_req.init_next_round_input()
|
||||
self.being_chunked_req = adder.add_inflight_req(
|
||||
self.being_chunked_req
|
||||
)
|
||||
self.being_chunked_req = adder.add_inflight_req(self.being_chunked_req)
|
||||
|
||||
if self.lora_paths:
|
||||
lora_set = (
|
||||
@@ -813,7 +809,8 @@ class Scheduler:
|
||||
embeddings = self.tp_worker.forward_batch_embedding(model_worker_batch)
|
||||
ret = embeddings, model_worker_batch.bid
|
||||
return ret
|
||||
def get_stats(self,batch: ScheduleBatch):
|
||||
|
||||
def get_stats(self, batch: ScheduleBatch):
|
||||
# TODO: get stats for chunked prefill
|
||||
|
||||
now = time.time()
|
||||
@@ -829,8 +826,8 @@ class Scheduler:
|
||||
# set stats from prefill
|
||||
if self.stats is not None:
|
||||
# new_seq=self.stats.new_seq
|
||||
cache_hit_rate=self.stats.cache_hit_rate
|
||||
token_usage=self.stats.token_usage
|
||||
cache_hit_rate = self.stats.cache_hit_rate
|
||||
token_usage = self.stats.token_usage
|
||||
# Iteration stats
|
||||
num_prompt_tokens_iter = 0
|
||||
num_generation_tokens_iter = 0
|
||||
@@ -851,15 +848,19 @@ class Scheduler:
|
||||
# _, next_token_ids, _ = result
|
||||
if batch is not None:
|
||||
num_generation_tokens_iter = len(batch.output_ids)
|
||||
gen_throughput = round(num_generation_tokens_iter / (now - self.last_stats_tic), 2)
|
||||
gen_throughput = round(
|
||||
num_generation_tokens_iter / (now - self.last_stats_tic), 2
|
||||
)
|
||||
|
||||
for i, req in enumerate(batch.reqs):
|
||||
# NOTE: Batch forward mode is extend befor start decode,
|
||||
if batch.forward_mode.is_extend():
|
||||
num_prompt_tokens_iter=len(batch.input_ids)+sum(batch.prefix_lens)
|
||||
num_prompt_tokens_iter = len(batch.input_ids) + sum(
|
||||
batch.prefix_lens
|
||||
)
|
||||
time_to_first_tokens_iter.append(now - req.started_time)
|
||||
else:
|
||||
time_per_output_tokens_iter.append(now-self.last_stats_tic)
|
||||
time_per_output_tokens_iter.append(now - self.last_stats_tic)
|
||||
|
||||
if req.finished():
|
||||
time_e2e_requests.append(now - req.created_time)
|
||||
@@ -867,9 +868,10 @@ class Scheduler:
|
||||
num_prompt_tokens_requests.append(len(req.origin_input_ids))
|
||||
num_generation_tokens_requests.append(len(req.output_ids))
|
||||
finished_reason_requests.append(
|
||||
req.finished_reason.to_json()
|
||||
if req.finished_reason is not None
|
||||
else None)
|
||||
req.finished_reason.to_json()
|
||||
if req.finished_reason is not None
|
||||
else None
|
||||
)
|
||||
|
||||
return Stats(
|
||||
new_seq=new_seq,
|
||||
@@ -893,7 +895,7 @@ class Scheduler:
|
||||
max_running_requests=self.max_running_requests,
|
||||
)
|
||||
|
||||
def log_stats(self,stats:Stats):
|
||||
def log_stats(self, stats: Stats):
|
||||
self.metrics_collector.log_stats(stats)
|
||||
|
||||
def process_batch_result(self, batch: ScheduleBatch, result):
|
||||
@@ -1003,9 +1005,7 @@ class Scheduler:
|
||||
if req.is_retracted:
|
||||
continue
|
||||
|
||||
if self.server_args.enable_overlap_schedule and (
|
||||
req.finished()
|
||||
):
|
||||
if self.server_args.enable_overlap_schedule and (req.finished()):
|
||||
self.token_to_kv_pool.free(batch.out_cache_loc[i : i + 1])
|
||||
continue
|
||||
|
||||
@@ -1031,7 +1031,10 @@ class Scheduler:
|
||||
self.token_to_kv_pool.free_group_end()
|
||||
|
||||
self.forward_ct_decode = (self.forward_ct_decode + 1) % (1 << 30)
|
||||
if self.tp_rank == 0 and self.forward_ct_decode % self.server_args.decode_log_interval == 0:
|
||||
if (
|
||||
self.tp_rank == 0
|
||||
and self.forward_ct_decode % self.server_args.decode_log_interval == 0
|
||||
):
|
||||
self.print_decode_stats()
|
||||
|
||||
def add_logprob_return_values(
|
||||
|
||||
@@ -215,7 +215,7 @@ class TokenizerManager:
|
||||
logprob_start_len,
|
||||
top_logprobs_num,
|
||||
obj.stream,
|
||||
obj.lora_path
|
||||
obj.lora_path,
|
||||
)
|
||||
elif isinstance(obj, EmbeddingReqInput):
|
||||
tokenized_obj = TokenizedEmbeddingReqInput(
|
||||
@@ -290,7 +290,9 @@ class TokenizerManager:
|
||||
|
||||
# Tokenize all requests
|
||||
objs = [obj[i] for i in range(batch_size)]
|
||||
tokenized_objs = await asyncio.gather(*(self._tokenize_one_request(obj) for obj in objs))
|
||||
tokenized_objs = await asyncio.gather(
|
||||
*(self._tokenize_one_request(obj) for obj in objs)
|
||||
)
|
||||
|
||||
# Cache the common prefix for parallel sampling
|
||||
for i in range(batch_size):
|
||||
@@ -322,7 +324,9 @@ class TokenizerManager:
|
||||
rid_to_index = {rid: i for i, rid in enumerate(rids)}
|
||||
task_map = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
|
||||
while task_map:
|
||||
done, _ = await asyncio.wait(task_map.keys(), return_when=asyncio.FIRST_COMPLETED)
|
||||
done, _ = await asyncio.wait(
|
||||
task_map.keys(), return_when=asyncio.FIRST_COMPLETED
|
||||
)
|
||||
|
||||
for task in done:
|
||||
gen = task_map.pop(task)
|
||||
@@ -367,7 +371,7 @@ class TokenizerManager:
|
||||
if self.server_args.dp_size == 1:
|
||||
res = await self.mem_pool_size
|
||||
return res.size
|
||||
else: # self.server_args.dp_size > 1
|
||||
else: # self.server_args.dp_size > 1
|
||||
self.mem_pool_size_tmp = []
|
||||
res = await self.mem_pool_size
|
||||
ret = [r.size for r in res]
|
||||
@@ -399,7 +403,7 @@ class TokenizerManager:
|
||||
self.server_args.load_format = obj.load_format
|
||||
self.model_path = obj.model_path
|
||||
return result.success, result.message
|
||||
else: # self.server_args.dp_size > 1
|
||||
else: # self.server_args.dp_size > 1
|
||||
self.model_update_tmp = []
|
||||
result = await self.model_update_result
|
||||
|
||||
@@ -470,7 +474,7 @@ class TokenizerManager:
|
||||
if isinstance(recv_obj, UpdateWeightReqOutput):
|
||||
if self.server_args.dp_size == 1:
|
||||
self.model_update_result.set_result(recv_obj)
|
||||
else: # self.server_args.dp_size > 1
|
||||
else: # self.server_args.dp_size > 1
|
||||
self.model_update_tmp.append(recv_obj)
|
||||
# set future if the all results are recevied
|
||||
if len(self.model_update_tmp) == self.server_args.dp_size:
|
||||
@@ -479,7 +483,7 @@ class TokenizerManager:
|
||||
elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
|
||||
if self.server_args.dp_size == 1:
|
||||
self.mem_pool_size.set_result(recv_obj)
|
||||
else: # self.sever_args.dp_size > 1
|
||||
else: # self.sever_args.dp_size > 1
|
||||
self.mem_pool_size_tmp.append(recv_obj)
|
||||
# set future if the all results are received
|
||||
if len(self.mem_pool_size_tmp) == self.server_args.dp_size:
|
||||
|
||||
@@ -130,27 +130,65 @@ class Metrics:
|
||||
self.counter_prompt_tokens = Counter(
|
||||
name="sglang:prompt_tokens_total",
|
||||
documentation="Number of prefill tokens processed.",
|
||||
labelnames=labelnames)
|
||||
labelnames=labelnames,
|
||||
)
|
||||
self.counter_generation_tokens = Counter(
|
||||
name="sglang:generation_tokens_total",
|
||||
documentation="Number of generation tokens processed.",
|
||||
labelnames=labelnames)
|
||||
labelnames=labelnames,
|
||||
)
|
||||
self.histogram_time_to_first_token = Histogram(
|
||||
name="sglang:time_to_first_token_seconds",
|
||||
documentation="Histogram of time to first token in seconds.",
|
||||
labelnames=labelnames,
|
||||
buckets=[
|
||||
0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
|
||||
0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0, 25.0, 30.0
|
||||
])
|
||||
0.001,
|
||||
0.005,
|
||||
0.01,
|
||||
0.02,
|
||||
0.04,
|
||||
0.06,
|
||||
0.08,
|
||||
0.1,
|
||||
0.25,
|
||||
0.5,
|
||||
0.75,
|
||||
1.0,
|
||||
2.5,
|
||||
5.0,
|
||||
7.5,
|
||||
10.0,
|
||||
15.0,
|
||||
20.0,
|
||||
25.0,
|
||||
30.0,
|
||||
],
|
||||
)
|
||||
self.histogram_time_per_output_token = Histogram(
|
||||
name="sglang:time_per_output_token_seconds",
|
||||
documentation="Histogram of time per output token in seconds.",
|
||||
labelnames=labelnames,
|
||||
buckets=[
|
||||
0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
|
||||
1.0, 2.5
|
||||
])
|
||||
0.005,
|
||||
0.01,
|
||||
0.015,
|
||||
0.02,
|
||||
0.025,
|
||||
0.03,
|
||||
0.04,
|
||||
0.05,
|
||||
0.075,
|
||||
0.1,
|
||||
0.15,
|
||||
0.2,
|
||||
0.3,
|
||||
0.4,
|
||||
0.5,
|
||||
0.75,
|
||||
1.0,
|
||||
2.5,
|
||||
],
|
||||
)
|
||||
|
||||
# Request Stats
|
||||
# Metadata
|
||||
@@ -245,14 +283,19 @@ class PrometheusMetricsCollector(MetricsCollector):
|
||||
stats.num_generation_tokens_requests,
|
||||
)
|
||||
|
||||
self._log_counter(self.metrics.counter_prompt_tokens,
|
||||
stats.num_prompt_tokens_iter)
|
||||
self._log_counter(self.metrics.counter_generation_tokens,
|
||||
stats.num_generation_tokens_iter)
|
||||
self._log_histogram(self.metrics.histogram_time_to_first_token,
|
||||
stats.time_to_first_tokens_iter)
|
||||
self._log_histogram(self.metrics.histogram_time_per_output_token,
|
||||
stats.time_per_output_tokens_iter)
|
||||
self._log_counter(
|
||||
self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
|
||||
)
|
||||
self._log_counter(
|
||||
self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter
|
||||
)
|
||||
self._log_histogram(
|
||||
self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter
|
||||
)
|
||||
self._log_histogram(
|
||||
self.metrics.histogram_time_per_output_token,
|
||||
stats.time_per_output_tokens_iter,
|
||||
)
|
||||
|
||||
# self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys)
|
||||
self._log_gauge(self.metrics.num_running_sys, stats.num_running_req)
|
||||
|
||||
@@ -28,7 +28,7 @@ from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
|
||||
#from sglang.srt.layers.activation import get_act_fn
|
||||
# from sglang.srt.layers.activation import get_act_fn
|
||||
from sglang.srt.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
@@ -47,15 +47,14 @@ class GPT2Attention(nn.Module):
|
||||
self,
|
||||
layer_id: int,
|
||||
config: GPT2Config,
|
||||
cache_config = None,
|
||||
cache_config=None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
self.hidden_size = config.hidden_size
|
||||
total_num_heads = config.num_attention_heads
|
||||
tensor_model_parallel_world_size = (
|
||||
get_tensor_model_parallel_world_size())
|
||||
tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
|
||||
assert total_num_heads % tensor_model_parallel_world_size == 0
|
||||
self.num_heads = total_num_heads // tensor_model_parallel_world_size
|
||||
self.head_dim = self.hidden_size // total_num_heads
|
||||
@@ -76,11 +75,13 @@ class GPT2Attention(nn.Module):
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.c_proj",
|
||||
)
|
||||
self.attn = RadixAttention(self.num_heads,
|
||||
self.head_dim,
|
||||
scaling=self.scale,
|
||||
num_kv_heads=total_num_heads,
|
||||
layer_id=layer_id)
|
||||
self.attn = RadixAttention(
|
||||
self.num_heads,
|
||||
self.head_dim,
|
||||
scaling=self.scale,
|
||||
num_kv_heads=total_num_heads,
|
||||
layer_id=layer_id,
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@@ -119,10 +120,14 @@ class GPT2MLP(nn.Module):
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.c_proj",
|
||||
)
|
||||
self.act = get_act_fn(config.activation_function, quant_config,
|
||||
intermediate_size)
|
||||
self.act = get_act_fn(
|
||||
config.activation_function, quant_config, intermediate_size
|
||||
)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor,) -> torch.Tensor:
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
hidden_states, _ = self.c_fc(hidden_states)
|
||||
hidden_states = self.act(hidden_states)
|
||||
hidden_states, _ = self.c_proj(hidden_states)
|
||||
@@ -135,27 +140,20 @@ class GPT2Block(nn.Module):
|
||||
self,
|
||||
layer_id: int,
|
||||
config: GPT2Config,
|
||||
cache_config = None,
|
||||
|
||||
cache_config=None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
hidden_size = config.hidden_size
|
||||
inner_dim = (config.n_inner if config.n_inner is not None else 4 *
|
||||
hidden_size)
|
||||
inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
|
||||
|
||||
self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
|
||||
self.attn = GPT2Attention(layer_id,
|
||||
config,
|
||||
cache_config,
|
||||
quant_config,
|
||||
prefix=f"{prefix}.attn")
|
||||
self.attn = GPT2Attention(
|
||||
layer_id, config, cache_config, quant_config, prefix=f"{prefix}.attn"
|
||||
)
|
||||
self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
|
||||
self.mlp = GPT2MLP(inner_dim,
|
||||
config,
|
||||
quant_config,
|
||||
prefix=f"{prefix}.mlp")
|
||||
self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp")
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@@ -179,13 +177,12 @@ class GPT2Block(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
|
||||
class GPT2Model(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: GPT2Config,
|
||||
cache_config = None,
|
||||
cache_config=None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
@@ -229,16 +226,15 @@ class GPT2LMHeadModel(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
config: GPT2Config,
|
||||
cache_config = None,
|
||||
cache_config=None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.quant_config = quant_config
|
||||
self.transformer = GPT2Model(config,
|
||||
cache_config,
|
||||
quant_config,
|
||||
prefix="transformer")
|
||||
self.transformer = GPT2Model(
|
||||
config, cache_config, quant_config, prefix="transformer"
|
||||
)
|
||||
self.lm_head = self.transformer.wte
|
||||
|
||||
self.logits_processor = LogitsProcessor(config)
|
||||
@@ -254,8 +250,6 @@ class GPT2LMHeadModel(nn.Module):
|
||||
input_ids, hidden_states, self.lm_head.weight, forward_batch
|
||||
)
|
||||
|
||||
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
for name, loaded_weight in weights:
|
||||
@@ -280,8 +274,8 @@ class GPT2LMHeadModel(nn.Module):
|
||||
if not name.endswith(".weight"):
|
||||
continue
|
||||
loaded_weight = loaded_weight.t()
|
||||
weight_loader = getattr(param, "weight_loader",
|
||||
default_weight_loader)
|
||||
weight_loader = getattr(param, "weight_loader", default_weight_loader)
|
||||
weight_loader(param, loaded_weight)
|
||||
|
||||
|
||||
EntryClass = GPT2LMHeadModel
|
||||
|
||||
@@ -419,6 +419,7 @@ def launch_engine(
|
||||
for i in range(len(scheduler_pipe_readers)):
|
||||
scheduler_pipe_readers[i].recv()
|
||||
|
||||
|
||||
def add_prometheus_middleware(app: FastAPI):
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.1/vllm/entrypoints/openai/api_server.py#L216
|
||||
from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess
|
||||
@@ -490,6 +491,7 @@ def launch_server(
|
||||
finally:
|
||||
t.join()
|
||||
|
||||
|
||||
def _set_prometheus_env():
|
||||
# Set prometheus multiprocess directory
|
||||
# sglang uses prometheus multiprocess mode
|
||||
@@ -506,6 +508,7 @@ def _set_prometheus_env():
|
||||
os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name
|
||||
logger.debug(f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}")
|
||||
|
||||
|
||||
def _set_envs_and_config(server_args: ServerArgs):
|
||||
# Set global environments
|
||||
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
||||
@@ -763,8 +766,8 @@ class Engine:
|
||||
# runtime server default log level is log
|
||||
# offline engine works in scripts, so we set it to error
|
||||
|
||||
if 'log_level' not in kwargs:
|
||||
kwargs['log_level'] = 'error'
|
||||
if "log_level" not in kwargs:
|
||||
kwargs["log_level"] = "error"
|
||||
|
||||
server_args = ServerArgs(*args, **kwargs)
|
||||
launch_engine(server_args=server_args)
|
||||
|
||||
@@ -448,7 +448,7 @@ class ServerArgs:
|
||||
"--decode-log-interval",
|
||||
type=int,
|
||||
default=ServerArgs.decode_log_interval,
|
||||
help="The log interval of decode batch"
|
||||
help="The log interval of decode batch",
|
||||
)
|
||||
|
||||
# Data parallelism
|
||||
|
||||
@@ -742,7 +742,13 @@ def run_mmlu_test(
|
||||
finally:
|
||||
pass
|
||||
|
||||
run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size)
|
||||
run_and_check_memory_leak(
|
||||
workload_func,
|
||||
disable_radix_cache,
|
||||
enable_mixed_chunk,
|
||||
enable_overlap,
|
||||
chunked_prefill_size,
|
||||
)
|
||||
|
||||
|
||||
def run_mulit_request_test(
|
||||
@@ -775,4 +781,10 @@ def run_mulit_request_test(
|
||||
with ThreadPoolExecutor(2) as executor:
|
||||
list(executor.map(run_one, list(range(4))))
|
||||
|
||||
run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size)
|
||||
run_and_check_memory_leak(
|
||||
workload_func,
|
||||
disable_radix_cache,
|
||||
enable_mixed_chunk,
|
||||
enable_overlap,
|
||||
chunked_prefill_size,
|
||||
)
|
||||
|
||||
@@ -349,6 +349,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
|
||||
|
||||
def terminate_process(process):
|
||||
from sglang.srt.utils import kill_child_process
|
||||
|
||||
kill_child_process(process.pid, include_self=True)
|
||||
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ router = router.Router(
|
||||
"http://localhost:30000",
|
||||
"http://localhost:30002",
|
||||
],
|
||||
policy="random"
|
||||
policy="random",
|
||||
)
|
||||
|
||||
# Start the router - this will block and run the server
|
||||
|
||||
@@ -104,15 +104,9 @@ if __name__ == "__main__":
|
||||
default="TinyLlama/TinyLlama-1.1B-Chat-v0.4",
|
||||
# default="meta-llama/Llama-2-7b-chat-hf",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-new-tokens",
|
||||
type=int,
|
||||
default=16)
|
||||
parser.add_argument("--max-new-tokens", type=int, default=16)
|
||||
|
||||
parser.add_argument(
|
||||
"--dtype",
|
||||
type=str,
|
||||
default="float16")
|
||||
parser.add_argument("--dtype", type=str, default="float16")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ ALL_OTHER_MODELS = [
|
||||
ModelCase("HuggingFaceTB/SmolLM-135M-Instruct", skip_long_prompt=True),
|
||||
ModelCase("allenai/OLMo-1B-0724-hf", decode_tolerance=8e-2, skip_long_prompt=True),
|
||||
ModelCase("THUDM/glm-4-9b-chat"),
|
||||
ModelCase("openai-community/gpt2")
|
||||
ModelCase("openai-community/gpt2"),
|
||||
]
|
||||
|
||||
TORCH_DTYPES = [torch.float16]
|
||||
|
||||
@@ -3,6 +3,7 @@ python3 -m unittest test_openai_server.TestOpenAIServer.test_batch
|
||||
python3 -m unittest test_openai_server.TestOpenAIServer.test_completion
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import unittest
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""
|
||||
python3 -m unittest test_skip_tokenizer_init.TestSkipTokenizerInit.test_parallel_sample
|
||||
"""
|
||||
|
||||
import json
|
||||
import unittest
|
||||
|
||||
|
||||
@@ -110,7 +110,6 @@ class TestSRTEngine(unittest.TestCase):
|
||||
def test_5_prompt_input_ids_consistency(self):
|
||||
prompt = "The capital of UK is"
|
||||
|
||||
|
||||
model_path = DEFAULT_MODEL_NAME_FOR_TEST
|
||||
engine = sgl.Engine(model_path=model_path, random_seed=42, log_level="error")
|
||||
sampling_params = {"temperature": 0, "max_new_tokens": 8}
|
||||
@@ -118,7 +117,9 @@ class TestSRTEngine(unittest.TestCase):
|
||||
|
||||
tokenizer = get_tokenizer(model_path)
|
||||
token_ids = tokenizer.encode(prompt)
|
||||
out2 = engine.generate(input_ids=token_ids, sampling_params=sampling_params)["text"]
|
||||
out2 = engine.generate(input_ids=token_ids, sampling_params=sampling_params)[
|
||||
"text"
|
||||
]
|
||||
|
||||
engine.shutdown()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user