Add return hidden state in the native API (#3897)

Co-authored-by: Beichen-Ma <mabeichen12@gmail.com> Co-authored-by: Chayenne <zhaochen20@outlook.com>
2025-02-27 01:06:54 -05:00
parent 71ed01833d
commit d6898dd253
9 changed files with 112 additions and 34 deletions
--- a/examples/runtime/engine/hidden_states.py
+++ b/examples/runtime/engine/hidden_states.py
@@ -2,7 +2,9 @@
 Usage:
 python hidden_states.py

-Note that we are actively working on moving return_hidden_states to the sampling_params.
+Note that each time you change the `return_hidden_states` parameter,
+the CUDA graph will be recaptured, which might lead to a performance hit.
+So avoid alternating between hidden-state requests and plain completion requests.
 """

 import sglang as sgl
@@ -18,10 +20,14 @@ def main():
    # Create an LLM.
    llm = sgl.Engine(
        model_path="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
-        return_hidden_states=True,
    )

-    sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 10}
+    sampling_params = {
+        "temperature": 0.8,
+        "top_p": 0.95,
+        "max_new_tokens": 10,
+        "return_hidden_states": True,
+    }

    outputs = llm.generate(prompts, sampling_params=sampling_params)
    for prompt, output in zip(prompts, outputs):