Add return hidden state in the native API (#3897)

Co-authored-by: Beichen-Ma <mabeichen12@gmail.com>
Co-authored-by: Chayenne <zhaochen20@outlook.com>
This commit is contained in:
Qiaolin Yu
2025-02-27 01:06:54 -05:00
committed by GitHub
parent 71ed01833d
commit d6898dd253
9 changed files with 112 additions and 34 deletions

View File

@@ -2,7 +2,9 @@
Usage:
python hidden_states.py
Note that we are actively working on moving return_hidden_states to the sampling_params.
Note that each time you change the `return_hidden_states` parameter,
the CUDA graph will be recaptured, which might lead to a performance hit.
So avoid alternating between requests for hidden states and requests for plain completions.
"""
import sglang as sgl
@@ -18,10 +20,14 @@ def main():
# Create an LLM.
llm = sgl.Engine(
model_path="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
return_hidden_states=True,
)
sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 10}
sampling_params = {
"temperature": 0.8,
"top_p": 0.95,
"max_new_tokens": 10,
"return_hidden_states": True,
}
outputs = llm.generate(prompts, sampling_params=sampling_params)
for prompt, output in zip(prompts, outputs):