2024-01-08 04:37:50 +00:00
|
|
|
"""
Usage:
python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --port 30000
python3 test_httpserver_decode.py

Output:
The capital of France is Paris.\nThe capital of the United States is Washington, D.C.\nThe capital of Canada is Ottawa.\nThe capital of Japan is Tokyo
"""
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line options describing where the already-running server listens.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="http://127.0.0.1")
    parser.add_argument("--port", type=int, default=30000)
    # Seconds to wait for the server before giving up. Without a timeout,
    # requests waits indefinitely, so a stalled server would hang this script.
    parser.add_argument("--timeout", type=float, default=60.0)
    args = parser.parse_args()

    # Base address of the server, e.g. "http://127.0.0.1:30000".
    url = f"{args.host}:{args.port}"

    # Send one prompt to the /generate endpoint, asking for at most 32 new
    # tokens at temperature 0.
    response = requests.post(
        url + "/generate",
        json={
            "text": "The capital of France is",
            "sampling_params": {
                "temperature": 0,
                "max_new_tokens": 32,
            },
            # Optional request fields, disabled by default. Presumably they
            # make the server return log-probabilities -- confirm against the
            # server API before enabling.
            # "return_logprob": True,
            # "logprob_start_len": 0,
        },
        timeout=args.timeout,
    )

    # Print the decoded JSON body of the server's reply.
    print(response.json())
|