Sync from v0.13
tests/entrypoints/openai/test_enable_force_include_usage.py (new file, 126 lines)
@@ -0,0 +1,126 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai
import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer


@pytest.fixture(scope="module")
def chat_server_with_force_include_usage(request):  # noqa: F811
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "128",
        "--enforce-eager",
        "--max-num-seqs",
        "4",
        "--enable-force-include-usage",
        "--port",
        "55857",
        "--gpu-memory-utilization",
        "0.2",
    ]
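
    # auto_port=False below because the fixture pins an explicit --port above.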
    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args, auto_port=False) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def chat_client_with_force_include_usage(chat_server_with_force_include_usage):
    async with chat_server_with_force_include_usage.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_chat_with_enable_force_include_usage(
    chat_client_with_force_include_usage: openai.AsyncOpenAI,
):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]

    stream = await chat_client_with_force_include_usage.chat.completions.create(
        model="Qwen/Qwen3-0.6B",
        messages=messages,
        max_completion_tokens=10,
        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
    )
    last_completion_tokens = 0
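    # With --enable-force-include-usage the server attaches usage stats to the
    # stream even though the request sets no stream_options; usage-only chunks
    # arrive with an empty choices list.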
    async for chunk in stream:
        if not len(chunk.choices):
            assert chunk.usage.prompt_tokens >= 0
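            # accept the count when nothing has been recorded yet, when it
            # strictly grew, or when a usage-only chunk repeats the last total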
            assert (
                last_completion_tokens == 0
                or chunk.usage.completion_tokens > last_completion_tokens
                or (
                    not chunk.choices
                    and chunk.usage.completion_tokens == last_completion_tokens
                )
            )
            assert chunk.usage.total_tokens == (
                chunk.usage.prompt_tokens + chunk.usage.completion_tokens
            )
        else:
            assert chunk.usage is None


@pytest.fixture(scope="module")
def transcription_server_with_force_include_usage():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-num-seqs",
        "4",
        "--enforce-eager",
        "--enable-force-include-usage",
        "--gpu-memory-utilization",
        "0.2",
    ]

    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def transcription_client_with_force_include_usage(
    transcription_server_with_force_include_usage,
):
    async with (
        transcription_server_with_force_include_usage.get_async_client() as async_client
    ):
        yield async_client


@pytest.mark.asyncio
async def test_transcription_with_enable_force_include_usage(
    transcription_client_with_force_include_usage, winning_call
):
    res = (
        await transcription_client_with_force_include_usage.audio.transcriptions.create(
            model="openai/whisper-large-v3-turbo",
            file=winning_call,
            language="en",
            temperature=0.0,
            stream=True,
            timeout=30,
        )
    )
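
    # Transcription streams follow the same contract: usage arrives on a
    # trailing chunk with no choices, surfaced here as a plain dict.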
    async for chunk in res:
        if not len(chunk.choices):
            # final usage sent
            usage = chunk.usage
            assert isinstance(usage, dict)
            assert usage["prompt_tokens"] > 0
            assert usage["completion_tokens"] > 0
            assert usage["total_tokens"] > 0
        else:
            assert not hasattr(chunk, "usage")