[OAI Server Refactor] [ChatCompletions & Completions] Implement UsageInfo Processor (#7360)

Co-authored-by: Chang Su <chang.s.su@oracle.com>
This commit is contained in:
yhyang201
2025-06-21 05:51:21 +08:00
committed by GitHub
parent cfb2fb5afc
commit dea2b84bc3
6 changed files with 108 additions and 96 deletions

View File

@@ -18,10 +18,8 @@ from sglang.srt.entrypoints.openai.protocol import (
ErrorResponse,
)
from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
from sglang.srt.entrypoints.openai.utils import (
aggregate_token_usage,
to_openai_style_logprobs,
)
from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
from sglang.srt.entrypoints.openai.utils import to_openai_style_logprobs
from sglang.srt.managers.io_struct import GenerateReqInput
logger = logging.getLogger(__name__)
@@ -214,11 +212,12 @@ class OpenAIServingCompletion(OpenAIServingBase):
# Handle final usage chunk
if request.stream_options and request.stream_options.include_usage:
usage = self._calculate_streaming_usage_base(
usage = UsageProcessor.calculate_streaming_usage(
prompt_tokens,
completion_tokens,
cached_tokens,
request.n,
n_choices=request.n,
enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
)
final_usage_chunk = CompletionStreamResponse(
id=content["meta_info"]["id"],
@@ -322,7 +321,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
# Calculate usage
cache_report = self.tokenizer_manager.server_args.enable_cache_report
usage = aggregate_token_usage(ret, request.n, cache_report)
usage = UsageProcessor.calculate_response_usage(
ret, n_choices=request.n, enable_cache_report=cache_report
)
return CompletionResponse(
id=ret[0]["meta_info"]["id"],