[Feat] Expose logprob options to sgl.gen API (#503)

Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
2024-07-09 15:35:39 +08:00
parent d557e9f3b7
commit 02b7258658
7 changed files with 239 additions and 43 deletions
--- a/examples/usage/cot_decoding.py
+++ b/examples/usage/cot_decoding.py
@@ -0,0 +1,121 @@
+from math import exp
+from pprint import pformat
+
+import sglang as sgl
+
+# ANSI escape sequences used to colorize the terminal output printed below;
+# CLEAR is emitted after each colored span.
+YELLOW = "\033[1;33m"
+GREEN = "\033[1;32m"
+BLUE = "\033[1;34m"
+CLEAR = "\033[1;0m"
+
+
+@sgl.function
+def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
+    """CoT Decoding: http://arxiv.org/abs/2402.10200"""
+
+    if is_chat_model:
+        s += sgl.user("Question: " + question + "\nAnswer:")
+        s += sgl.assistant_begin()
+    else:
+        s += "Question: " + question + "\nAnswer:"
+
+    step_0 = s.fork(1)[0]
+    forks = s.fork(get_top_k)
+    answer_forks = s.fork(get_top_k)
+
+    # decoding step 0
+    step_0 += sgl.gen(
+        "get_top_k",
+        max_tokens=0,
+        return_logprob=True,
+        top_logprobs_num=get_top_k,
+        return_text_in_logprobs=True,
+    )
+    logprobs = step_0.get_meta_info("get_top_k")["decode_top_logprobs"][0]
+
+    print("Decoding step 0:",
+          ", ".join(pformat(token[2]) for token in logprobs))
+    for idx, (f, token) in enumerate(zip(forks, logprobs)):
+        logprob, token_id, text = token
+        f += text
+
+        if text == "<|end_of_text|>":
+            print(
+                f"{YELLOW}Path #{idx} {pformat(text)}[{exp(logprob):.3f}] (score=nan, answer=nan){CLEAR}"
+            )
+            continue
+
+        # continue greedy decoding
+        f += sgl.gen(
+            "answer",
+            temperature=0,
+            max_tokens=1024,
+            return_logprob=True,
+            top_logprobs_num=2,
+            return_text_in_logprobs=True,
+        )
+
+        # calculate probability disparity between the top and secondary tokens
+        x1s = [
+            exp(xt[0][0])
+            for xt in f.get_meta_info("answer")["decode_top_logprobs"]
+        ]
+        x2s = [
+            exp(xt[1][0])
+            for xt in f.get_meta_info("answer")["decode_top_logprobs"]
+        ]
+        tokens = [
+            xt[0][2] for xt in f.get_meta_info("answer")["decode_top_logprobs"]
+        ]
+        delta = (sum(x1s) - sum(x2s)) / len(x1s)
+
+        # extract the answer span (without the '<|end_of_text|>' token)
+        answer_forks[idx] += text + f["answer"] + "\nSo the answer is"
+        answer_forks[idx] += sgl.gen(
+            "answer_span",
+            temperature=0,
+            max_tokens=64,
+            return_logprob=True,
+            top_logprobs_num=2,
+            return_text_in_logprobs=True,
+        )
+        answer = answer_forks[idx]['answer_span'].replace('\n', ' ').strip(':')
+        print(
+            f"{YELLOW}Path #{idx} {pformat(text)}[{exp(logprob):.3f}] (score={delta}, answer={answer}){CLEAR}"
+        )
+        generated_text = str(answer_forks[idx])[len("ProgramState("):-1]
+        print(f"{BLUE}{pformat(generated_text)}{CLEAR}")
+
+        if verbose:
+            answer_tokens = [
+                xt[0][2] for xt in answer_forks[idx].get_meta_info(
+                    "answer_span")["decode_top_logprobs"]
+            ]
+            answer_x1s = [
+                exp(xt[0][0]) for xt in answer_forks[idx].get_meta_info(
+                    "answer_span")["decode_top_logprobs"]
+            ]
+            answer_x2s = [
+                exp(xt[1][0]) for xt in answer_forks[idx].get_meta_info(
+                    "answer_span")["decode_top_logprobs"]
+            ]
+
+            for token, x1, x2 in zip(tokens, x1s, x2s):
+                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})",
+                      end="")
+            print("\n===========")
+            for token, x1, x2 in zip(answer_tokens, answer_x1s, answer_x2s):
+                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})",
+                      end="")
+            print()
+
+
+sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
+
+state = cot_decoding.run(
+    question=
+    r"Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4  weeks?",
+    get_top_k=10,
+    is_chat_model=True,
+    verbose=False,
+)