From eb4308c4c9e3efbd58e86ec63e4f88dc36c363a8 Mon Sep 17 00:00:00 2001
From: Arsalan <41029759+amirarsalan90@users.noreply.github.com>
Date: Tue, 12 Mar 2024 03:16:06 -0400
Subject: [PATCH] adding the triton docker build minimal example (#242)

---
 examples/usage/triton/Dockerfile              | 10 +++++
 examples/usage/triton/README.md               | 41 +++++++++++++++++
 .../models/character_generation/1/model.py    | 45 +++++++++++++++++++
 .../models/character_generation/config.pbtxt  | 23 ++++++++++
 4 files changed, 119 insertions(+)
 create mode 100644 examples/usage/triton/Dockerfile
 create mode 100644 examples/usage/triton/README.md
 create mode 100644 examples/usage/triton/models/character_generation/1/model.py
 create mode 100644 examples/usage/triton/models/character_generation/config.pbtxt

diff --git a/examples/usage/triton/Dockerfile b/examples/usage/triton/Dockerfile
new file mode 100644
index 000000000..d97342e26
--- /dev/null
+++ b/examples/usage/triton/Dockerfile
@@ -0,0 +1,10 @@
+FROM nvcr.io/nvidia/tritonserver:24.01-py3
+# Fetch the SGLang sources; the clone below creates /opt/sglang.
+WORKDIR /opt
+
+RUN git clone https://github.com/sgl-project/sglang.git
+# Install SGLang in editable mode from its python/ directory (with the "all" extras), plus datasets.
+WORKDIR /opt/sglang
+RUN pip install --upgrade pip && \
+    pip install -e "python[all]" && \
+    pip install datasets
\ No newline at end of file
diff --git a/examples/usage/triton/README.md b/examples/usage/triton/README.md
new file mode 100644
index 000000000..48a9c8354
--- /dev/null
+++ b/examples/usage/triton/README.md
@@ -0,0 +1,41 @@
+# sglang_triton
+
+Build the docker image:
+```
+docker build -t sglang-triton .
+```
+
+Then start the container, mounting the model repository at /mnt/models:
+```
+docker run -ti --gpus=all --network=host --name sglang-triton -v ./models:/mnt/models sglang-triton
+```
+
+Inside the Docker container, launch the SGLang server:
+```
+cd /opt/sglang
+python3 -m sglang.launch_server --model-path mistralai/Mistral-7B-Instruct-v0.2 --port 30000 --mem-fraction-static 0.9
+```
+
+From another shell on the host, open a shell inside the running container and start Triton:
+```
+docker exec -ti sglang-triton /bin/bash
+cd /mnt
+tritonserver --model-repository=/mnt/models
+```
+
+
+Send a request to the server:
+```
+curl -X POST http://localhost:8000/v2/models/character_generation/generate \
+     -H "Content-Type: application/json" \
+     -d '{
+           "inputs": [
+               {
+                   "name": "INPUT_TEXT",
+                   "datatype": "STRING",
+                   "shape": [1],
+                   "data": ["Name1"]
+               }
+           ]
+         }'
+```
\ No newline at end of file
diff --git a/examples/usage/triton/models/character_generation/1/model.py b/examples/usage/triton/models/character_generation/1/model.py
new file mode 100644
index 000000000..e76992f95
--- /dev/null
+++ b/examples/usage/triton/models/character_generation/1/model.py
@@ -0,0 +1,45 @@
+import triton_python_backend_utils as pb_utils
+import numpy
+import sglang as sgl
+from sglang import function, set_default_backend
+from sglang.srt.constrained import build_regex_from_object
+
+from pydantic import BaseModel
+# Use the SGLang server listening on localhost:30000 as the default backend for this module.
+sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
+
+class Character(BaseModel):  # schema handed to build_regex_from_object to constrain generation
+    name: str
+    eye_color: str
+    house: str
+
+@function
+def character_gen(s, name):
+    """Append a prompt about `name`, then generate "json_output" constrained by the Character regex."""
+    intro = " is a character in Harry Potter. Please fill in the following information about this character.\n"
+    s += name + intro
+    character_regex = build_regex_from_object(Character)
+    s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
+
+
+class TritonPythonModel:
+    """Triton Python-backend model that runs the SGLang program `character_gen` on each request's names."""
+    def initialize(self, args):
+        print("Initialized.")
+    def execute(self, requests):
+        """Return one InferenceResponse per request, in order; a request without INPUT_TEXT gets an error response."""
+        responses = []
+        for request in requests:
+            tensor_in = pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT")
+            if tensor_in is None:
+                error = pb_utils.TritonError("Missing required input tensor: INPUT_TEXT")
+                responses.append(pb_utils.InferenceResponse(output_tensors=[], error=error))
+                continue
+
+            input_list_names = [i.decode('utf-8') if isinstance(i, bytes) else i for i in tensor_in.as_numpy().tolist()]
+            input_list_dicts = [{"name":i} for i in input_list_names]
+            states = character_gen.run_batch(input_list_dicts)
+            character_strs = [state.text() for state in states]
+            tensor_out = pb_utils.Tensor("OUTPUT_TEXT", numpy.array(character_strs, dtype=object))
+            responses.append(pb_utils.InferenceResponse(output_tensors=[tensor_out]))
+        return responses
\ No newline at end of file
diff --git a/examples/usage/triton/models/character_generation/config.pbtxt b/examples/usage/triton/models/character_generation/config.pbtxt
new file mode 100644
index 000000000..7546f993a
--- /dev/null
+++ b/examples/usage/triton/models/character_generation/config.pbtxt
@@ -0,0 +1,23 @@
+name: "character_generation"  # same as the model directory models/character_generation
+backend: "python"  # implemented by 1/model.py (class TritonPythonModel)
+input [
+    {
+        name: "INPUT_TEXT"  # looked up by model.py via get_input_tensor_by_name
+        data_type: TYPE_STRING
+        dims: [ -1 ]  # variable-length list of character names
+    }
+]
+output [
+    {
+        name: "OUTPUT_TEXT"  # generated text, one string per input name
+        data_type: TYPE_STRING
+        dims: [ -1 ]
+    }
+]
+instance_group [
+    {
+        count: 1
+        kind: KIND_GPU
+        gpus: [ 0 ]  # single model instance placed on GPU 0
+    }
+]