enginex-mthreads-vllm/tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import jsonschema
import openai
import pytest
import pytest_asyncio
from rapidfuzz import fuzz

from ....utils import RemoteOpenAIServer

MODEL_NAME = "openai/gpt-oss-20b"


@pytest.fixture(scope="module")
def server():
    """Launch one vLLM server for the whole module and yield its handle.

    The server is started for ``MODEL_NAME`` with automatic tool choice
    enabled and the ``openai`` tool-call parser selected. The
    ``RemoteOpenAIServer`` context is exited once the module-scoped fixture
    is finalized.
    """
    # Flags are grouped by concern; the resulting list is passed through
    # unchanged as the server's command-line arguments.
    launch_args = ["--max-model-len", "8192"]
    launch_args += ["--enforce-eager", "--enable-auto-tool-choice"]
    launch_args += ["--tool-call-parser", "openai"]

    with RemoteOpenAIServer(MODEL_NAME, launch_args) as running_server:
        yield running_server


@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the shared ``server`` fixture.

    The client's async context is entered before the requesting test runs
    and exited when the fixture is finalized.
    """
    async with server.get_async_client() as api_client:
        yield api_client


# ==========================================================
# Tool Definitions
# ==========================================================
def _single_string_param_tool(
    name: str, description: str, param: str, param_description: str
) -> dict[str, object]:
    """Build a function-tool spec that takes exactly one required string parameter."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {
                    param: {
                        "type": "string",
                        "description": param_description,
                    }
                },
                "required": [param],
            },
        },
    }


# Tool specs sent with every request in this module: an arithmetic
# calculator and a per-city clock, each taking a single string argument.
TOOLS = [
    _single_string_param_tool(
        "calculator",
        "Performs basic arithmetic calculations.",
        "expression",
        "Arithmetic expression to evaluate, e.g. '123 + 456'.",
    ),
    _single_string_param_tool(
        "get_time",
        "Retrieves the current local time for a given city.",
        "city",
        "City name, e.g. 'New York'.",
    ),
]


# ==========================================================
# Message Examples
# ==========================================================
# Single-turn user prompt that explicitly asks for the calculator tool.
MESSAGES_CALC = [
    {
        "role": "user",
        "content": "Calculate 123 + 456 using the calculator.",
    },
]

# Single-turn user prompt asking for the current time in a named city.
MESSAGES_GET_TIME = [
    {
        "role": "user",
        "content": "What is the current time in New York?",
    },
]

# System + user prompt pair asking for two tool calls (calculator, then
# get_time) in one turn.
MESSAGES_MULTIPLE_CALLS = [
    {
        "role": "system",
        "content": (
            "You can call multiple tools. "
            # The trailing space matters: adjacent literals are concatenated
            # verbatim, and without it the prompt reads "arraycontaining".
            "When using more than one, return single JSON object with tool_calls array "
            "containing each tool call with its function name and arguments. "
            "Do not output multiple JSON objects separately."
        ),
    },
    {
        "role": "user",
        "content": "First, calculate 7 * 8 using the calculator. "
        "Then, use get_time to tell me the current time in New York.",
    },
]

# Deliberately vague user prompt that asks for help but explicitly rules out
# performing a calculation.
MESSAGES_INVALID_CALL = [
    {
        "role": "user",
        "content": (
            "Can you help with something, "
            "but don’t actually perform any calculation?"
        ),
    },
]


# Expected outputs: tool names (and JSON-encoded argument strings) that the
# prompts above are meant to elicit.
# NOTE(review): FUNC_ARGS_CALC / FUNC_ARGS_TIME are not referenced by any test
# visible in this module — confirm whether they are still needed.
FUNC_CALC = "calculator"
FUNC_ARGS_CALC = '{"expression":"123 + 456"}'

FUNC_TIME = "get_time"
FUNC_ARGS_TIME = '{"city": "New York"}'


# ==========================================================
# Utility to extract reasoning and tool calls
# ==========================================================
def extract_reasoning_and_calls(chunks: list) -> tuple[str, list[str], list[str]]:
    """
    Accumulate reasoning text and tool calls from streaming chunks.

    Each chunk is expected to expose ``choices``; only the first choice's
    ``delta`` is inspected. Chunks with an empty/missing ``choices`` list or
    without a ``delta`` are skipped instead of raising.

    Tool-call fragments are grouped by their ``index`` attribute (0 when the
    attribute is absent): argument fragments are concatenated in arrival
    order, while a non-empty function name replaces any earlier name for
    that index.

    Returns:
        A ``(reasoning_content, arguments, function_names)`` tuple. Note the
        order: the arguments list precedes the names list. Both lists are
        ordered by ascending tool-call index and are parallel to each other.
    """
    reasoning_parts: list[str] = []
    tool_calls: dict[int, dict[str, str]] = {}

    for chunk in chunks:
        # Guard against chunks that carry no choices at all; indexing [0]
        # on them would raise IndexError.
        choices = getattr(chunk, "choices", None)
        if not choices:
            continue

        delta = getattr(choices[0], "delta", None)
        if not delta:
            continue

        reasoning_piece = getattr(delta, "reasoning_content", None)
        if reasoning_piece:
            reasoning_parts.append(reasoning_piece)

        # ``tool_calls`` may be missing or None on a delta; treat both as empty.
        for tc in getattr(delta, "tool_calls", None) or []:
            idx = getattr(tc, "index", 0)
            tool_entry = tool_calls.setdefault(idx, {"name": "", "arguments": ""})

            func = getattr(tc, "function", None)
            if not func:
                continue

            name = getattr(func, "name", None)
            if name:
                tool_entry["name"] = name

            fragment = getattr(func, "arguments", None)
            if fragment:
                tool_entry["arguments"] += fragment

    # Sort the indices once and derive both parallel lists from that order.
    ordered_entries = [tool_calls[idx] for idx in sorted(tool_calls)]
    function_names: list[str] = [entry["name"] for entry in ordered_entries]
    arguments: list[str] = [entry["arguments"] for entry in ordered_entries]

    return "".join(reasoning_parts), arguments, function_names


# ==========================================================
# Test Scenarios
# ==========================================================
@pytest.mark.asyncio
async def test_calculator_tool_call_and_argument_accuracy(client: openai.AsyncOpenAI):
    """Non-streaming request must call ``calculator`` with the expected expression.

    Checks, in order: a tool call exists, one of them targets the calculator,
    its raw arguments mention both operands, the arguments are valid JSON,
    and the ``expression`` value is a close fuzzy match (> 90) for
    ``"123 + 456"``.
    """
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_CALC,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )

    reply = completion.choices[0].message
    emitted_calls = getattr(reply, "tool_calls", [])
    assert emitted_calls, "No tool calls detected"

    # Pick the first call that targets the calculator, if any.
    calculator_call = None
    for candidate in emitted_calls:
        if candidate.function.name == FUNC_CALC:
            calculator_call = candidate
            break
    assert calculator_call, "Calculator function not called"

    argument_text = calculator_call.function.arguments
    assert argument_text, "Calculator arguments missing"

    operands_present = "123" in argument_text and "456" in argument_text
    assert operands_present, f"Expected values not in raw arguments: {argument_text}"

    try:
        decoded_args = json.loads(argument_text)
    except json.JSONDecodeError:
        pytest.fail(f"Invalid JSON in calculator arguments: {argument_text}")

    wanted_expr = "123 + 456"
    got_expr = decoded_args.get("expression", "")
    score = fuzz.ratio(got_expr, wanted_expr)

    # Fuzzy comparison tolerates small formatting differences in the
    # expression string.
    assert score > 90, (
        f"Expression mismatch: expected '{wanted_expr}' "
        f"got '{got_expr}' (similarity={score}%)"
    )


@pytest.mark.asyncio
async def test_streaming_tool_call_get_time_with_reasoning(client: openai.AsyncOpenAI):
    """Streamed request must call ``get_time`` for New York and emit relevant reasoning."""
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_GET_TIME,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )

    # Drain the stream fully before inspecting it.
    received = []
    async for piece in stream:
        received.append(piece)

    reasoning, arguments, function_names = extract_reasoning_and_calls(received)

    assert FUNC_TIME in function_names, "get_time function not called"

    mentions_city = any("New York" in arg for arg in arguments)
    assert mentions_city, (
        f"Expected get_time arguments for New York not found in {arguments}"
    )

    assert len(reasoning) > 0, "Expected reasoning content missing"

    # Case-sensitive relevance check: at least one keyword must appear.
    relevance_keywords = ["New York", "time", "current"]
    is_relevant = any(keyword in reasoning for keyword in relevance_keywords)
    assert is_relevant, f"Reasoning is not relevant to the request: {reasoning}"


@pytest.mark.asyncio
async def test_streaming_multiple_tools(client: openai.AsyncOpenAI):
    """Streamed multi-tool request must call both tools and emit reasoning.

    Sends the two-step prompt (calculator, then get_time) with streaming
    enabled, accumulates the chunks, and requires that both tool names
    appear and that some reasoning content was streamed.
    """
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_MULTIPLE_CALLS,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )

    chunks = [chunk async for chunk in stream]
    reasoning, _arguments, function_names = extract_reasoning_and_calls(chunks)

    # Let assertion failures propagate so pytest reports them; catching
    # AssertionError here would make this test pass unconditionally.
    assert FUNC_CALC in function_names, (
        f"Calculator tool missing — found {function_names}"
    )
    assert FUNC_TIME in function_names, (
        f"Time tool missing — found {function_names}"
    )
    assert len(reasoning) > 0, "Expected reasoning content in streamed response"


@pytest.mark.asyncio
async def test_invalid_tool_call(client: openai.AsyncOpenAI):
    """
    A prompt that explicitly asks for no calculation must yield a message
    with a ``content`` field and no tool calls.
    """
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_INVALID_CALL,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )

    reply = completion.choices[0].message

    assert reply is not None, "Expected message in response"
    assert hasattr(reply, "content"), "Expected 'content' field in message"

    # A missing attribute, None, and an empty list all count as "no tool calls".
    attempted_calls = getattr(reply, "tool_calls", [])
    assert not attempted_calls, (
        f"Model unexpectedly attempted a tool call on invalid input: {attempted_calls}"
    )


@pytest.mark.asyncio
async def test_tool_call_with_temperature(client: openai.AsyncOpenAI):
    """
    With sampling at temperature 0.7 the reply must still contain either
    tool calls or text; both are printed for inspection.
    """
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_CALC,
        tools=TOOLS,
        temperature=0.7,
        stream=False,
    )

    reply = completion.choices[0].message
    assert reply is not None, "Expected non-empty message in response"

    # Either output form is acceptable here; only an entirely empty reply fails.
    produced_calls = reply.tool_calls
    produced_text = reply.content
    assert produced_calls or produced_text, (
        "Response missing both text and tool calls"
    )

    print(f"\nTool calls: {produced_calls}")
    print(f"Text: {produced_text}")


@pytest.mark.asyncio
async def test_tool_response_schema_accuracy(client: openai.AsyncOpenAI):
    """Every returned tool call's arguments must validate against its tool's schema."""
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_MULTIPLE_CALLS,
        tools=TOOLS,
        temperature=0.0,
    )

    emitted_calls = completion.choices[0].message.tool_calls
    assert emitted_calls, "No tool calls produced"

    for emitted in emitted_calls:
        called_name = emitted.function.name
        decoded_args = json.loads(emitted.function.arguments)

        # Look up the ``parameters`` schema of the first declared tool whose
        # function name matches the call; None when nothing matches.
        declared_functions = (entry.get("function") for entry in TOOLS)
        schema: dict[str, object] | None = next(
            (
                declared.get("parameters")
                for declared in declared_functions
                if declared
                and isinstance(declared, dict)
                and declared.get("name") == called_name
            ),
            None,
        )

        assert schema is not None, f"No matching tool schema found for {called_name}"

        jsonschema.validate(instance=decoded_args, schema=schema)


@pytest.mark.asyncio
async def test_semantic_consistency_with_temperature(client: openai.AsyncOpenAI):
    """Message text at T=0.0 and T=0.5 must stay similar (fuzzy ratio > 60).

    The same calculator prompt is sent at temperatures 0.0, 0.5 and 1.0, and
    the stripped message ``content`` (empty string when absent) is collected
    for each.
    """
    texts_by_run = []
    for sampling_temp in (0.0, 0.5, 1.0):
        completion = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=MESSAGES_CALC,
            tools=TOOLS,
            temperature=sampling_temp,
        )
        content = completion.choices[0].message.content or ""
        texts_by_run.append(content.strip())

    # Only the first two runs are compared; the T=1.0 request is issued but
    # its text does not take part in the assertion.
    low_text, mid_text = texts_by_run[0], texts_by_run[1]
    low_mid_sim = fuzz.ratio(low_text, mid_text)
    assert low_mid_sim > 60, (
        f"Semantic drift too large between T=0.0 and T=0.5 ({low_mid_sim}%)"
    )