Align completion and chat_completion response to OpenAI API (#4637)
This commit is contained in:
@@ -314,6 +314,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
created = int(time.time())
|
||||||
ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
|
ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
|
||||||
if not isinstance(ret, list):
|
if not isinstance(ret, list):
|
||||||
ret = [ret]
|
ret = [ret]
|
||||||
@@ -321,13 +322,19 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
|
|||||||
responses = v1_chat_generate_response(
|
responses = v1_chat_generate_response(
|
||||||
request,
|
request,
|
||||||
ret,
|
ret,
|
||||||
|
created,
|
||||||
to_file=True,
|
to_file=True,
|
||||||
cache_report=tokenizer_manager.server_args.enable_cache_report,
|
cache_report=tokenizer_manager.server_args.enable_cache_report,
|
||||||
tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
|
tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
responses = v1_generate_response(
|
responses = v1_generate_response(
|
||||||
request, ret, tokenizer_manager, to_file=True
|
request,
|
||||||
|
ret,
|
||||||
|
tokenizer_manager,
|
||||||
|
created,
|
||||||
|
to_file=True,
|
||||||
|
cache_report=tokenizer_manager.server_args.enable_cache_report,
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -577,7 +584,9 @@ def v1_generate_request(
|
|||||||
return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
|
return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
|
||||||
|
|
||||||
|
|
||||||
def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
|
def v1_generate_response(
|
||||||
|
request, ret, tokenizer_manager, created, to_file=False, cache_report=False
|
||||||
|
):
|
||||||
choices = []
|
choices = []
|
||||||
echo = False
|
echo = False
|
||||||
|
|
||||||
@@ -675,7 +684,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
|
|||||||
# remain the same but if needed we can change that
|
# remain the same but if needed we can change that
|
||||||
"id": ret[i]["meta_info"]["id"],
|
"id": ret[i]["meta_info"]["id"],
|
||||||
"object": "text_completion",
|
"object": "text_completion",
|
||||||
"created": int(time.time()),
|
"created": created,
|
||||||
"model": request[i].model,
|
"model": request[i].model,
|
||||||
"choices": choice,
|
"choices": choice,
|
||||||
"usage": {
|
"usage": {
|
||||||
@@ -694,14 +703,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
|
|||||||
ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
|
ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
|
||||||
)
|
)
|
||||||
completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
|
completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
|
||||||
|
cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
|
||||||
response = CompletionResponse(
|
response = CompletionResponse(
|
||||||
id=ret[0]["meta_info"]["id"],
|
id=ret[0]["meta_info"]["id"],
|
||||||
model=request.model,
|
model=request.model,
|
||||||
|
created=created,
|
||||||
choices=choices,
|
choices=choices,
|
||||||
usage=UsageInfo(
|
usage=UsageInfo(
|
||||||
prompt_tokens=prompt_tokens,
|
prompt_tokens=prompt_tokens,
|
||||||
completion_tokens=completion_tokens,
|
completion_tokens=completion_tokens,
|
||||||
total_tokens=prompt_tokens + completion_tokens,
|
total_tokens=prompt_tokens + completion_tokens,
|
||||||
|
prompt_tokens_details=(
|
||||||
|
{"cached_tokens": cached_tokens} if cache_report else None
|
||||||
|
),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
return response
|
return response
|
||||||
@@ -710,6 +724,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
|
|||||||
async def v1_completions(tokenizer_manager, raw_request: Request):
|
async def v1_completions(tokenizer_manager, raw_request: Request):
|
||||||
request_json = await raw_request.json()
|
request_json = await raw_request.json()
|
||||||
all_requests = [CompletionRequest(**request_json)]
|
all_requests = [CompletionRequest(**request_json)]
|
||||||
|
created = int(time.time())
|
||||||
adapted_request, request = v1_generate_request(all_requests)
|
adapted_request, request = v1_generate_request(all_requests)
|
||||||
|
|
||||||
if adapted_request.stream:
|
if adapted_request.stream:
|
||||||
@@ -719,6 +734,8 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
|
|||||||
n_prev_tokens = {}
|
n_prev_tokens = {}
|
||||||
prompt_tokens = {}
|
prompt_tokens = {}
|
||||||
completion_tokens = {}
|
completion_tokens = {}
|
||||||
|
cached_tokens = {}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async for content in tokenizer_manager.generate_request(
|
async for content in tokenizer_manager.generate_request(
|
||||||
adapted_request, raw_request
|
adapted_request, raw_request
|
||||||
@@ -731,6 +748,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
|
|||||||
text = content["text"]
|
text = content["text"]
|
||||||
prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
|
prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
|
||||||
completion_tokens[index] = content["meta_info"]["completion_tokens"]
|
completion_tokens[index] = content["meta_info"]["completion_tokens"]
|
||||||
|
cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
|
||||||
|
|
||||||
if not stream_buffer: # The first chunk
|
if not stream_buffer: # The first chunk
|
||||||
if request.echo:
|
if request.echo:
|
||||||
@@ -803,6 +821,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
|
|||||||
)
|
)
|
||||||
chunk = CompletionStreamResponse(
|
chunk = CompletionStreamResponse(
|
||||||
id=content["meta_info"]["id"],
|
id=content["meta_info"]["id"],
|
||||||
|
created=created,
|
||||||
object="text_completion",
|
object="text_completion",
|
||||||
choices=[choice_data],
|
choices=[choice_data],
|
||||||
model=request.model,
|
model=request.model,
|
||||||
@@ -821,14 +840,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
|
|||||||
total_completion_tokens = sum(
|
total_completion_tokens = sum(
|
||||||
tokens for tokens in completion_tokens.values()
|
tokens for tokens in completion_tokens.values()
|
||||||
)
|
)
|
||||||
|
cache_report = tokenizer_manager.server_args.enable_cache_report
|
||||||
|
if cache_report:
|
||||||
|
cached_tokens_sum = sum(
|
||||||
|
tokens for tokens in cached_tokens.values()
|
||||||
|
)
|
||||||
|
prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
|
||||||
|
else:
|
||||||
|
prompt_tokens_details = None
|
||||||
usage = UsageInfo(
|
usage = UsageInfo(
|
||||||
prompt_tokens=total_prompt_tokens,
|
prompt_tokens=total_prompt_tokens,
|
||||||
completion_tokens=total_completion_tokens,
|
completion_tokens=total_completion_tokens,
|
||||||
total_tokens=total_prompt_tokens + total_completion_tokens,
|
total_tokens=total_prompt_tokens + total_completion_tokens,
|
||||||
|
prompt_tokens_details=prompt_tokens_details,
|
||||||
)
|
)
|
||||||
|
|
||||||
final_usage_chunk = CompletionStreamResponse(
|
final_usage_chunk = CompletionStreamResponse(
|
||||||
id=content["meta_info"]["id"],
|
id=content["meta_info"]["id"],
|
||||||
|
created=created,
|
||||||
choices=[],
|
choices=[],
|
||||||
model=request.model,
|
model=request.model,
|
||||||
usage=usage,
|
usage=usage,
|
||||||
@@ -859,7 +888,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
|
|||||||
if not isinstance(ret, list):
|
if not isinstance(ret, list):
|
||||||
ret = [ret]
|
ret = [ret]
|
||||||
|
|
||||||
response = v1_generate_response(request, ret, tokenizer_manager)
|
response = v1_generate_response(
|
||||||
|
request,
|
||||||
|
ret,
|
||||||
|
tokenizer_manager,
|
||||||
|
created,
|
||||||
|
cache_report=tokenizer_manager.server_args.enable_cache_report,
|
||||||
|
)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
@@ -1045,6 +1080,7 @@ def v1_chat_generate_request(
|
|||||||
def v1_chat_generate_response(
|
def v1_chat_generate_response(
|
||||||
request,
|
request,
|
||||||
ret,
|
ret,
|
||||||
|
created,
|
||||||
to_file=False,
|
to_file=False,
|
||||||
cache_report=False,
|
cache_report=False,
|
||||||
tool_call_parser=None,
|
tool_call_parser=None,
|
||||||
@@ -1196,7 +1232,7 @@ def v1_chat_generate_response(
|
|||||||
# remain the same but if needed we can change that
|
# remain the same but if needed we can change that
|
||||||
"id": ret[i]["meta_info"]["id"],
|
"id": ret[i]["meta_info"]["id"],
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"created": int(time.time()),
|
"created": created,
|
||||||
"model": request[i].model,
|
"model": request[i].model,
|
||||||
"choices": choice,
|
"choices": choice,
|
||||||
"usage": {
|
"usage": {
|
||||||
@@ -1218,6 +1254,7 @@ def v1_chat_generate_response(
|
|||||||
cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
|
cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
|
||||||
response = ChatCompletionResponse(
|
response = ChatCompletionResponse(
|
||||||
id=ret[0]["meta_info"]["id"],
|
id=ret[0]["meta_info"]["id"],
|
||||||
|
created=created,
|
||||||
model=request.model,
|
model=request.model,
|
||||||
choices=choices,
|
choices=choices,
|
||||||
usage=UsageInfo(
|
usage=UsageInfo(
|
||||||
@@ -1232,9 +1269,12 @@ def v1_chat_generate_response(
|
|||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
async def v1_chat_completions(tokenizer_manager, raw_request: Request):
|
async def v1_chat_completions(
|
||||||
|
tokenizer_manager, raw_request: Request, cache_report=False
|
||||||
|
):
|
||||||
request_json = await raw_request.json()
|
request_json = await raw_request.json()
|
||||||
all_requests = [ChatCompletionRequest(**request_json)]
|
all_requests = [ChatCompletionRequest(**request_json)]
|
||||||
|
created = int(time.time())
|
||||||
adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
|
adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
|
||||||
|
|
||||||
if adapted_request.stream:
|
if adapted_request.stream:
|
||||||
@@ -1247,6 +1287,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
|
|||||||
n_prev_tokens = {}
|
n_prev_tokens = {}
|
||||||
prompt_tokens = {}
|
prompt_tokens = {}
|
||||||
completion_tokens = {}
|
completion_tokens = {}
|
||||||
|
cached_tokens = {}
|
||||||
try:
|
try:
|
||||||
async for content in tokenizer_manager.generate_request(
|
async for content in tokenizer_manager.generate_request(
|
||||||
adapted_request, raw_request
|
adapted_request, raw_request
|
||||||
@@ -1260,6 +1301,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
|
|||||||
|
|
||||||
prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
|
prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
|
||||||
completion_tokens[index] = content["meta_info"]["completion_tokens"]
|
completion_tokens[index] = content["meta_info"]["completion_tokens"]
|
||||||
|
cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
|
||||||
if request.logprobs:
|
if request.logprobs:
|
||||||
logprobs = to_openai_style_logprobs(
|
logprobs = to_openai_style_logprobs(
|
||||||
output_token_logprobs=content["meta_info"][
|
output_token_logprobs=content["meta_info"][
|
||||||
@@ -1339,6 +1381,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
|
|||||||
)
|
)
|
||||||
chunk = ChatCompletionStreamResponse(
|
chunk = ChatCompletionStreamResponse(
|
||||||
id=content["meta_info"]["id"],
|
id=content["meta_info"]["id"],
|
||||||
|
created=created,
|
||||||
choices=[choice_data],
|
choices=[choice_data],
|
||||||
model=request.model,
|
model=request.model,
|
||||||
)
|
)
|
||||||
@@ -1378,6 +1421,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
|
|||||||
)
|
)
|
||||||
chunk = ChatCompletionStreamResponse(
|
chunk = ChatCompletionStreamResponse(
|
||||||
id=content["meta_info"]["id"],
|
id=content["meta_info"]["id"],
|
||||||
|
created=created,
|
||||||
choices=[choice_data],
|
choices=[choice_data],
|
||||||
model=request.model,
|
model=request.model,
|
||||||
)
|
)
|
||||||
@@ -1414,6 +1458,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
|
|||||||
)
|
)
|
||||||
chunk = ChatCompletionStreamResponse(
|
chunk = ChatCompletionStreamResponse(
|
||||||
id=content["meta_info"]["id"],
|
id=content["meta_info"]["id"],
|
||||||
|
created=created,
|
||||||
choices=[choice_data],
|
choices=[choice_data],
|
||||||
model=request.model,
|
model=request.model,
|
||||||
)
|
)
|
||||||
@@ -1464,6 +1509,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
|
|||||||
)
|
)
|
||||||
chunk = ChatCompletionStreamResponse(
|
chunk = ChatCompletionStreamResponse(
|
||||||
id=content["meta_info"]["id"],
|
id=content["meta_info"]["id"],
|
||||||
|
created=created,
|
||||||
choices=[choice_data],
|
choices=[choice_data],
|
||||||
model=request.model,
|
model=request.model,
|
||||||
)
|
)
|
||||||
@@ -1491,6 +1537,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
|
|||||||
)
|
)
|
||||||
chunk = ChatCompletionStreamResponse(
|
chunk = ChatCompletionStreamResponse(
|
||||||
id=content["meta_info"]["id"],
|
id=content["meta_info"]["id"],
|
||||||
|
created=created,
|
||||||
choices=[choice_data],
|
choices=[choice_data],
|
||||||
model=request.model,
|
model=request.model,
|
||||||
)
|
)
|
||||||
@@ -1506,14 +1553,24 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
|
|||||||
total_completion_tokens = sum(
|
total_completion_tokens = sum(
|
||||||
tokens for tokens in completion_tokens.values()
|
tokens for tokens in completion_tokens.values()
|
||||||
)
|
)
|
||||||
|
cache_report = tokenizer_manager.server_args.enable_cache_report
|
||||||
|
if cache_report:
|
||||||
|
cached_tokens_sum = sum(
|
||||||
|
tokens for tokens in cached_tokens.values()
|
||||||
|
)
|
||||||
|
prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
|
||||||
|
else:
|
||||||
|
prompt_tokens_details = None
|
||||||
usage = UsageInfo(
|
usage = UsageInfo(
|
||||||
prompt_tokens=total_prompt_tokens,
|
prompt_tokens=total_prompt_tokens,
|
||||||
completion_tokens=total_completion_tokens,
|
completion_tokens=total_completion_tokens,
|
||||||
total_tokens=total_prompt_tokens + total_completion_tokens,
|
total_tokens=total_prompt_tokens + total_completion_tokens,
|
||||||
|
prompt_tokens_details=prompt_tokens_details,
|
||||||
)
|
)
|
||||||
|
|
||||||
final_usage_chunk = ChatCompletionStreamResponse(
|
final_usage_chunk = ChatCompletionStreamResponse(
|
||||||
id=content["meta_info"]["id"],
|
id=content["meta_info"]["id"],
|
||||||
|
created=created,
|
||||||
choices=[],
|
choices=[],
|
||||||
model=request.model,
|
model=request.model,
|
||||||
usage=usage,
|
usage=usage,
|
||||||
@@ -1546,6 +1603,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
|
|||||||
response = v1_chat_generate_response(
|
response = v1_chat_generate_response(
|
||||||
request,
|
request,
|
||||||
ret,
|
ret,
|
||||||
|
created,
|
||||||
cache_report=tokenizer_manager.server_args.enable_cache_report,
|
cache_report=tokenizer_manager.server_args.enable_cache_report,
|
||||||
tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
|
tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
|
||||||
reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
|
reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
|
||||||
|
|||||||
Reference in New Issue
Block a user