# xc-llm-ascend/tests/e2e/multicard/2-cards/spec_decode/test_quarot_eagle.py

# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# Run `pytest tests/e2e/multicard/2-cards/spec_decode/test_quarot_eagle.py`.

from __future__ import annotations

import os
from typing import Any

import pytest
from transformers import AutoTokenizer
from vllm import SamplingParams
from vllm.v1.metrics.reader import Counter, Vector

from tests.e2e.conftest import VllmRunner

# Set vLLM's worker multiprocessing start method to "spawn" at import time.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

# Number of speculative tokens requested from the draft model.
K = 4
# Absolute tolerance allowed when comparing per-position acceptance rates.
TOL = 0.06


# Two target/draft pairings are exercised, one per scenario:
#   * "32B": the draft model ships with its own embedding.
#   * "30B": the draft model shares the embedding of the target model.
# Each entry names the float target, its W8A8 QuaRot counterpart, and the
# eagle3 draft model used with both of them.
MODELS: dict[str, dict] = {
    "32B": dict(
        target=dict(
            float="Qwen/Qwen3-32B",
            w8a8="vllm-ascend/Qwen3-32B-W8A8-QuaRot",
        ),
        draft="RedHatAI/Qwen3-32B-speculator.eagle3",
    ),
    "30B": dict(
        target=dict(
            float="Qwen/Qwen3-30B-A3B",
            w8a8="vllm-ascend/Qwen3-30B-A3B-W8A8-QuaRot",
        ),
        draft="AngelSlim/Qwen3-a3B_eagle3",
    ),
}


def _build_prompts(target_model: str) -> list[str]:
    """Return the chat-templated prompts used to drive generation.

    Eight single-turn user questions (one taken from each mt-bench category)
    are each wrapped in a one-message conversation and rendered to plain text
    with the chat template of ``target_model``'s tokenizer, with the
    generation prompt appended.

    Args:
        target_model: Model name or path passed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        One rendered prompt string per question, in the order listed below.
    """
    # The preprocessing differs slightly from what `vllm bench serve` does,
    # but that does not matter for this test. Later on the prompts could be
    # read straight from mt-bench, or `vllm bench serve` could be invoked
    # directly instead.
    questions = (
        (
            "Compose an engaging travel blog post about a recent trip to Hawaii, "
            "highlighting cultural experiences and must-see attractions."
        ),
        (
            "Pretend yourself to be Elon Musk in all the following conversations. "
            "Speak like Elon Musk as much as possible. Why do we need to go to Mars?"
        ),
        (
            "Imagine you are participating in a race with a group of people. "
            "If you have just overtaken the second person, what's your current position? "
            "Where is the person you just overtook?"
        ),
        (
            "The vertices of a triangle are at points (0, 0), (-1, 1), and (3, 3). "
            "What is the area of the triangle?"
        ),
        (
            "Develop a Python program that reads all the text files under a directory "
            "and returns top-5 words with the most number of occurrences."
        ),
        (
            "Evaluate the following movie reviews on a scale of 1 to 5, with 1 being very negative, "
            "3 being neutral, and 5 being very positive:\n1. This movie released on Nov. 18, 2019, was phenomenal. "
            "The cinematography, the acting, the plot - everything was top-notch.\n"
            "2. Never before have I been so disappointed with a movie. The plot was predictable and the characters "
            "were one-dimensional. In my opinion, this movie is the worst one to have been released in 2022.\n"
            "3. The movie was okay. There were some parts I  enjoyed, but there were also parts that felt lackluster. "
            "This is a movie that was released in Feb 2018 and seems to be quite ordinary.\n"
            "Return the answer as a JSON array of integers."
        ),
        (
            "In the field of quantum physics, what is superposition, "
            "and how does it relate to the phenomenon of quantum entanglement?"
        ),
        (
            "Provide insights into the correlation between economic indicators such as GDP, "
            "inflation, and unemployment rates. Explain how fiscal and monetary policies affect those indicators."
        ),
    )

    tokenizer = AutoTokenizer.from_pretrained(target_model, trust_remote_code=True)

    rendered: list[str] = []
    for question in questions:
        # Every question becomes its own single-message conversation.
        conversation = [{"role": "user", "content": question}]
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=True,
            )
        )
    return rendered


def _run_model(
    llm_kwargs: dict,
    prompts: list[str],
    sampling_params: SamplingParams,
) -> list[Any]:
    """Generate for ``prompts`` once and return the engine's metrics.

    Args:
        llm_kwargs: Keyword arguments forwarded unchanged to ``VllmRunner``.
        prompts: Already-templated prompt strings to generate from.
        sampling_params: Sampling configuration used for generation.

    Returns:
        Whatever ``llm.model.get_metrics()`` reports after generation.
    """
    # The runner is a context manager, so it is exited before returning;
    # presumably this releases the devices for the next run (TODO confirm).
    with VllmRunner(**llm_kwargs) as runner:
        # Only the metrics matter here; the generated text is discarded.
        runner.generate(prompts, sampling_params)
        collected = runner.model.get_metrics()

    return collected


def _compute_acceptance(metrics: list[Any]) -> list[float | int]:
    """Turn raw speculative-decoding metrics into per-position acceptance rates.

    Args:
        metrics: Metric objects as returned by ``get_metrics()``. Only the
            entries named ``vllm:spec_decode_num_drafts`` and
            ``vllm:spec_decode_num_accepted_tokens_per_pos`` are used.

    Returns:
        A list of ``K`` values, where entry ``i`` is the number of accepted
        tokens at draft position ``i`` divided by the total number of drafts.
        Every entry is ``0.0`` when no drafts were recorded.
    """
    draft_total = 0
    accepted_by_pos = [0 for _ in range(K)]

    for entry in metrics:
        name = entry.name
        if name == "vllm:spec_decode_num_drafts":
            assert isinstance(entry, Counter)
            draft_total += entry.value
            continue
        if name == "vllm:spec_decode_num_accepted_tokens_per_pos":
            assert isinstance(entry, Vector)
            for pos, count in enumerate(entry.values):
                accepted_by_pos[pos] += count

    # Without any drafts the rates are undefined; report zeros instead of
    # dividing by zero.
    if not draft_total > 0:
        return [0.0 for _ in accepted_by_pos]
    return [accepted / draft_total for accepted in accepted_by_pos]


@pytest.mark.parametrize("model", ["32B", "30B"])
def test_quarot_eagle_acceptance_tp2(model: str):
    """Check that a QuaRot W8A8 target accepts eagle3 drafts like the float one.

    The float target and its W8A8 QuaRot counterpart are each run with the
    same eagle3 draft model, prompts and sampling parameters on two cards
    (tensor parallel size 2). The per-position draft acceptance rates of the
    two runs must agree within ``TOL``.

    Args:
        model: Key into ``MODELS`` selecting the target/draft pairing.
    """
    model_cfg = MODELS[model]
    target_model = model_cfg["target"]["float"]
    draft_model = model_cfg["draft"]

    prompts = _build_prompts(target_model)

    # temperature=0 selects greedy decoding, keeping both runs comparable.
    sampling_params = SamplingParams(
        temperature=0,
        ignore_eos=False,
        max_tokens=512,
    )

    llm_kwargs = dict(
        model_name=target_model,
        enforce_eager=True,
        max_model_len=4096,
        # Stats logging has to stay enabled, since the comparison below is
        # based on the metrics collected during generation.
        disable_log_stats=False,
        tensor_parallel_size=2,
        distributed_executor_backend="mp",
        gpu_memory_utilization=0.9,
        speculative_config={
            "enforce_eager": True,
            "method": "eagle3",
            "model": draft_model,
            "num_speculative_tokens": K,
        },
    )

    # Run the float model and the quarot model,
    # and then compare their acceptance rates at each position.
    ref_metrics = _run_model(llm_kwargs, prompts, sampling_params)
    ref_acceptance = _compute_acceptance(ref_metrics)

    # Build a separate kwargs dict for the quantized run so the reference
    # configuration above is left untouched.
    quarot_kwargs = {
        **llm_kwargs,
        "model_name": model_cfg["target"]["w8a8"],
        "quantization": "ascend",
    }

    quarot_metrics = _run_model(quarot_kwargs, prompts, sampling_params)
    quarot_acceptance = _compute_acceptance(quarot_metrics)

    # Guard against a vacuous pass: if no speculative-decoding metrics were
    # recorded, both rate lists are all zeros and would trivially "match".
    assert any(rate > 0 for rate in ref_acceptance), (
        "No accepted draft tokens were recorded for the float model; "
        "speculative decoding metrics appear to be missing."
    )
    assert any(rate > 0 for rate in quarot_acceptance), (
        "No accepted draft tokens were recorded for the quarot model; "
        "speculative decoding metrics appear to be missing."
    )

    match = all(abs(i - j) <= TOL for i, j in zip(ref_acceptance, quarot_acceptance))

    assert match, (
        f"\nref_acceptance_per_pos: {[round(_, 4) for _ in ref_acceptance]}"
        f"\nquarot_acceptance_per_pos: {[round(_, 4) for _ in quarot_acceptance]}"
    )
[Test][Feature] Add e2e test for QuaRot model with eagle3 (#7128)
### What this PR does / why we need it?
Add an e2e test for the QuaRot model with eagle3 that runs both the QuaRot model and the float model, and then compares their acceptance rates. Related PRs that adapt the QuaRot model to eagle3: #6914, #7038.
- vLLM version: v0.16.0
- vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d
Signed-off-by: zhaomingyu <zhaomingyu13@h-partners.com>
2026-03-16 15:35:55 +08:00
			`# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.`
			`# Copyright 2023 The vLLM team.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`# This file is a part of the vllm-ascend project.`
			`#`
			# Run `pytest tests/e2e/multicard/2-cards/spec_decode/test_quarot_eagle.py`.

			`from __future__ import annotations`

			`import os`
			`from typing import Any`

			`import pytest`
			`from transformers import AutoTokenizer`
			`from vllm import SamplingParams`
			`from vllm.v1.metrics.reader import Counter, Vector`

			`from tests.e2e.conftest import VllmRunner`

			`os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"`

			`K = 4 # Number of speculative tokens`
			`TOL = 0.06 # Absolute tolerance for acceptance comparison`


			`# Here, the two selected models correspond to two scenarios.`
			`# The 32B draft model comes with its own embedding,`
			`# while the 30B draft model shares the embedding of the target model.`
			`MODELS: dict[str, dict] = {`
			`"32B": {`
			`"target": {`
			`"float": "Qwen/Qwen3-32B",`
			`"w8a8": "vllm-ascend/Qwen3-32B-W8A8-QuaRot",`
			`},`
			`"draft": "RedHatAI/Qwen3-32B-speculator.eagle3",`
			`},`
			`"30B": {`
			`"target": {`
			`"float": "Qwen/Qwen3-30B-A3B",`
			`"w8a8": "vllm-ascend/Qwen3-30B-A3B-W8A8-QuaRot",`
			`},`
			`"draft": "AngelSlim/Qwen3-a3B_eagle3",`
			`},`
			`}`


			`def _build_prompts(target_model: str) -> list[str]:`
			`# These prompts were formed by taking one from each category of mt-bench.`
			`# Although there are still some differences from the processing method of`
			`# vllm serve bench, it does not affect this test.`
			`# it is possible to directly take from mt-bench or further`
			`# call vllm bench serve for direct testing later.`
			`prompts = [`
			`{`
			`"role": "user",`
			`"content": "Compose an engaging travel blog post about a recent trip to Hawaii, "`
			`"highlighting cultural experiences and must-see attractions.",`
			`},`
			`{`
			`"role": "user",`
			`"content": "Pretend yourself to be Elon Musk in all the following conversations. "`
			`"Speak like Elon Musk as much as possible. Why do we need to go to Mars?",`
			`},`
			`{`
			`"role": "user",`
			`"content": "Imagine you are participating in a race with a group of people. "`
			`"If you have just overtaken the second person, what's your current position? "`
			`"Where is the person you just overtook?",`
			`},`
			`{`
			`"role": "user",`
			`"content": "The vertices of a triangle are at points (0, 0), (-1, 1), and (3, 3). "`
			`"What is the area of the triangle?",`
			`},`
			`{`
			`"role": "user",`
			`"content": "Develop a Python program that reads all the text files under a directory "`
			`"and returns top-5 words with the most number of occurrences.",`
			`},`
			`{`
			`"role": "user",`
			`"content": "Evaluate the following movie reviews on a scale of 1 to 5, with 1 being very negative, "`
			`"3 being neutral, and 5 being very positive:\n1. This movie released on Nov. 18, 2019, was phenomenal. "`
			`"The cinematography, the acting, the plot - everything was top-notch.\n"`
			`"2. Never before have I been so disappointed with a movie. The plot was predictable and the characters "`
			`"were one-dimensional. In my opinion, this movie is the worst one to have been released in 2022.\n"`
			`"3. The movie was okay. There were some parts I enjoyed, but there were also parts that felt lackluster. "`
			`"This is a movie that was released in Feb 2018 and seems to be quite ordinary.\n"`
			`"Return the answer as a JSON array of integers.",`
			`},`
			`{`
			`"role": "user",`
			`"content": "In the field of quantum physics, what is superposition, "`
			`"and how does it relate to the phenomenon of quantum entanglement?",`
			`},`
			`{`
			`"role": "user",`
			`"content": "Provide insights into the correlation between economic indicators such as GDP, "`
			`"inflation, and unemployment rates. Explain how fiscal and monetary policies affect those indicators.",`
			`},`
			`]`

			`tokenizer = AutoTokenizer.from_pretrained(`
			`target_model,`
			`trust_remote_code=True,`
			`)`

			`prompts_with_template: list[str] = [`
			`tokenizer.apply_chat_template(`
			`[prompt],`
			`tokenize=False,`
			`add_generation_prompt=True,`
			`)`
			`for prompt in prompts`
			`]`
			`return prompts_with_template`


			`def _run_model(`
			`llm_kwargs: dict,`
			`prompts: list[str],`
			`sampling_params: SamplingParams,`
			`) -> list[Any]:`
			`with VllmRunner(**llm_kwargs) as llm:`
			`_ = llm.generate(prompts, sampling_params)`
			`metrics = llm.model.get_metrics()`

			`return metrics`


			`def _compute_acceptance(metrics: list[Any]) -> list[float \| int]:`
			`num_drafts = 0`
			`num_accepted_tokens_per_pos = [0] * K`

			`for metric in metrics:`
			`if metric.name == "vllm:spec_decode_num_drafts":`
			`assert isinstance(metric, Counter)`
			`num_drafts += metric.value`

			`elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":`
			`assert isinstance(metric, Vector)`
			`for i, v in enumerate(metric.values):`
			`num_accepted_tokens_per_pos[i] += v`

			`acceptance_per_pos = [`
			`num_accepted_tokens / num_drafts if num_drafts > 0 else 0.0`
			`for num_accepted_tokens in num_accepted_tokens_per_pos`
			`]`

			`return acceptance_per_pos`


			`@pytest.mark.parametrize("model", ["32B", "30B"])`
			`def test_quarot_eagle_acceptance_tp2(model: str):`
			`target_model = MODELS[model]["target"]["float"]`
			`draft_model = MODELS[model]["draft"]`

			`prompts = _build_prompts(target_model)`

			`sampling_params = SamplingParams(`
			`temperature=0,`
			`ignore_eos=False,`
			`max_tokens=512,`
			`)`

			`llm_kwargs = dict(`
			`model_name=target_model,`
			`enforce_eager=True,`
			`max_model_len=4096,`
			`disable_log_stats=False,`
			`tensor_parallel_size=2,`
			`distributed_executor_backend="mp",`
			`gpu_memory_utilization=0.9,`
			`speculative_config={`
			`"enforce_eager": True,`
			`"method": "eagle3",`
			`"model": draft_model,`
			`"num_speculative_tokens": K,`
			`},`
			`)`

			`# Run the float model and the quarot model,`
			`# and then compare their acceptance rates at each position.`
			`ref_metrics = _run_model(llm_kwargs, prompts, sampling_params)`
			`ref_acceptance = _compute_acceptance(ref_metrics)`

			`llm_kwargs["model_name"] = MODELS[model]["target"]["w8a8"]`
			`llm_kwargs["quantization"] = "ascend"`

			`quarot_metrics = _run_model(llm_kwargs, prompts, sampling_params)`
			`quarot_acceptance = _compute_acceptance(quarot_metrics)`

			`match = all(abs(i - j) <= TOL for i, j in zip(ref_acceptance, quarot_acceptance))`

			`assert match, (`
			`f"\nref_acceptance_per_pos: {[round(_, 4) for _ in ref_acceptance]}"`
			`f"\nquarot_acceptance_per_pos: {[round(_, 4) for _ in quarot_acceptance]}"`
			`)`