diff --git a/.github/workflows/misc/model_list.json b/.github/workflows/misc/model_list.json index a45bebcc..c82d9671 100644 --- a/.github/workflows/misc/model_list.json +++ b/.github/workflows/misc/model_list.json @@ -1,6 +1,7 @@ { "models": [ "AngelSlim/Qwen3-32B_eagle3", + "AngelSlim/Qwen3-a3B_eagle3", "Anionex/Qwen3-1.7B-W4A8-V1", "ArthurZ/ilama-3.2-1B", "BAAI/bge-base-en-v1.5", @@ -207,10 +208,12 @@ "vllm-ascend/Qwen3-30B-A3B-Puring", "vllm-ascend/Qwen3-30B-A3B-W8A8", "vllm-ascend/Qwen3-30B-A3B-W8A8-Pruning", + "vllm-ascend/Qwen3-30B-A3B-W8A8-QuaRot", "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8", "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8", "vllm-ascend/Qwen3-32B-W4A4", "vllm-ascend/Qwen3-32B-W8A8", + "vllm-ascend/Qwen3-32B-W8A8-QuaRot", "vllm-ascend/Qwen3-8B", "vllm-ascend/Qwen3-8B-W4A8", "vllm-ascend/Qwen3-8B-W8A8", diff --git a/.github/workflows/scripts/config.yaml b/.github/workflows/scripts/config.yaml index 6b7bd2a8..e009841d 100644 --- a/.github/workflows/scripts/config.yaml +++ b/.github/workflows/scripts/config.yaml @@ -54,7 +54,7 @@ e2e-singlecard: - name: tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py estimated_time: 1500 - name: tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py - estimated_time: 1800 + estimated_time: 600 - name: tests/e2e/singlecard/model_runner_v2/test_basic.py estimated_time: 80 is_skipped: true @@ -101,6 +101,8 @@ e2e-multicard-2-cards: estimated_time: 60 - name: tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py estimated_time: 223 + - name: tests/e2e/multicard/2-cards/spec_decode/test_quarot_eagle.py + estimated_time: 600 # Run the test in a separate step to avoid oom - name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2 estimated_time: 100 diff --git a/tests/e2e/multicard/2-cards/spec_decode/test_quarot_eagle.py b/tests/e2e/multicard/2-cards/spec_decode/test_quarot_eagle.py new file mode 100644 index 00000000..cf4bbd38 --- 
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# Run `pytest tests/e2e/multicard/2-cards/spec_decode/test_quarot_eagle.py`.

from __future__ import annotations

import os
from typing import Any

import pytest
from transformers import AutoTokenizer
from vllm import SamplingParams
from vllm.v1.metrics.reader import Counter, Vector

from tests.e2e.conftest import VllmRunner

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

K = 4  # Number of speculative tokens
TOL = 0.06  # Absolute tolerance for acceptance comparison


# The two selected models correspond to two scenarios:
# the 32B draft model comes with its own embedding,
# while the 30B draft model shares the embedding of the target model.
MODELS: dict[str, dict] = {
    "32B": {
        "target": {
            "float": "Qwen/Qwen3-32B",
            "w8a8": "vllm-ascend/Qwen3-32B-W8A8-QuaRot",
        },
        "draft": "RedHatAI/Qwen3-32B-speculator.eagle3",
    },
    "30B": {
        "target": {
            "float": "Qwen/Qwen3-30B-A3B",
            "w8a8": "vllm-ascend/Qwen3-30B-A3B-W8A8-QuaRot",
        },
        "draft": "AngelSlim/Qwen3-a3B_eagle3",
    },
}


def _build_prompts(target_model: str) -> list[str]:
    """Return the benchmark prompts rendered with the target model's chat template.

    Args:
        target_model: Model name or path passed to
            ``AutoTokenizer.from_pretrained`` to obtain the chat template.

    Returns:
        One templated prompt string per user message, each rendered as a
        single-turn conversation with the generation prompt appended.
    """
    # These prompts were formed by taking one from each category of mt-bench.
    # Although the processing still differs somewhat from what
    # `vllm bench serve` does, the difference does not affect this test.
    # Later on, the prompts could be taken directly from mt-bench, or the
    # test could call `vllm bench serve` directly.
    messages = [
        {
            "role": "user",
            "content": "Compose an engaging travel blog post about a recent trip to Hawaii, "
            "highlighting cultural experiences and must-see attractions.",
        },
        {
            "role": "user",
            "content": "Pretend yourself to be Elon Musk in all the following conversations. "
            "Speak like Elon Musk as much as possible. Why do we need to go to Mars?",
        },
        {
            "role": "user",
            "content": "Imagine you are participating in a race with a group of people. "
            "If you have just overtaken the second person, what's your current position? "
            "Where is the person you just overtook?",
        },
        {
            "role": "user",
            "content": "The vertices of a triangle are at points (0, 0), (-1, 1), and (3, 3). "
            "What is the area of the triangle?",
        },
        {
            "role": "user",
            "content": "Develop a Python program that reads all the text files under a directory "
            "and returns top-5 words with the most number of occurrences.",
        },
        {
            "role": "user",
            "content": "Evaluate the following movie reviews on a scale of 1 to 5, with 1 being very negative, "
            "3 being neutral, and 5 being very positive:\n1. This movie released on Nov. 18, 2019, was phenomenal. "
            "The cinematography, the acting, the plot - everything was top-notch.\n"
            "2. Never before have I been so disappointed with a movie. The plot was predictable and the characters "
            "were one-dimensional. In my opinion, this movie is the worst one to have been released in 2022.\n"
            "3. The movie was okay. There were some parts I enjoyed, but there were also parts that felt lackluster. "
            "This is a movie that was released in Feb 2018 and seems to be quite ordinary.\n"
            "Return the answer as a JSON array of integers.",
        },
        {
            "role": "user",
            "content": "In the field of quantum physics, what is superposition, "
            "and how does it relate to the phenomenon of quantum entanglement?",
        },
        {
            "role": "user",
            "content": "Provide insights into the correlation between economic indicators such as GDP, "
            "inflation, and unemployment rates. Explain how fiscal and monetary policies affect those indicators.",
        },
    ]

    tokenizer = AutoTokenizer.from_pretrained(
        target_model,
        trust_remote_code=True,
    )

    # Each message is templated on its own, as a one-message conversation.
    return [
        tokenizer.apply_chat_template(
            [message],
            tokenize=False,
            add_generation_prompt=True,
        )
        for message in messages
    ]


def _run_model(
    llm_kwargs: dict,
    prompts: list[str],
    sampling_params: SamplingParams,
) -> list[Any]:
    """Generate for ``prompts`` in a fresh ``VllmRunner`` and return its metrics.

    Args:
        llm_kwargs: Keyword arguments forwarded to ``VllmRunner``.
        prompts: Prompt strings passed to ``generate``.
        sampling_params: Sampling parameters passed to ``generate``.

    Returns:
        The value of ``llm.model.get_metrics()`` read after generation,
        while the runner context is still open.
    """
    with VllmRunner(**llm_kwargs) as llm:
        # The generated text is not needed; only the metrics are compared.
        llm.generate(prompts, sampling_params)
        return llm.model.get_metrics()


def _compute_acceptance(metrics: list[Any]) -> list[float]:
    """Compute the acceptance rate at each of the ``K`` speculative positions.

    The rate for a position is the summed
    ``vllm:spec_decode_num_accepted_tokens_per_pos`` count at that position
    divided by the summed ``vllm:spec_decode_num_drafts`` count.

    Args:
        metrics: Metric objects exposing ``name`` and either ``value``
            (counters) or ``values`` (vectors).

    Returns:
        A list of ``K`` floats, one acceptance rate per position.

    Raises:
        AssertionError: If a matching metric has an unexpected type, or if
            no drafts were recorded at all.
    """
    num_drafts = 0
    accepted_per_pos = [0] * K

    for metric in metrics:
        if metric.name == "vllm:spec_decode_num_drafts":
            assert isinstance(metric, Counter)
            num_drafts += metric.value
        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
            assert isinstance(metric, Vector)
            for pos, count in enumerate(metric.values):
                accepted_per_pos[pos] += count

    # Without drafts every rate would silently be 0.0, and comparing two such
    # runs would pass without testing anything, so fail loudly instead.
    assert num_drafts > 0, (
        "no speculative drafts were recorded "
        "('vllm:spec_decode_num_drafts' is missing or zero); "
        "acceptance rates cannot be computed"
    )

    return [count / num_drafts for count in accepted_per_pos]


@pytest.mark.parametrize("model", list(MODELS))
def test_quarot_eagle_acceptance_tp2(model: str):
    """Check that the QuaRot W8A8 target keeps the float target's acceptance rates.

    The same prompts are run with the same draft model against the float
    target and against the W8A8 QuaRot target. The per-position acceptance
    rates of the two runs must agree within ``TOL``.
    """
    model_cfg = MODELS[model]
    float_target = model_cfg["target"]["float"]
    quarot_target = model_cfg["target"]["w8a8"]
    draft_model = model_cfg["draft"]

    prompts = _build_prompts(float_target)

    sampling_params = SamplingParams(
        temperature=0,
        ignore_eos=False,
        max_tokens=512,
    )

    float_kwargs = dict(
        model_name=float_target,
        enforce_eager=True,
        max_model_len=4096,
        disable_log_stats=False,
        tensor_parallel_size=2,
        distributed_executor_backend="mp",
        gpu_memory_utilization=0.9,
        speculative_config={
            "enforce_eager": True,
            "method": "eagle3",
            "model": draft_model,
            "num_speculative_tokens": K,
        },
    )
    # Build a separate dict for the quantized run instead of mutating the
    # float configuration in place.
    quarot_kwargs = {
        **float_kwargs,
        "model_name": quarot_target,
        "quantization": "ascend",
    }

    # Run the float model and the quarot model,
    # and then compare their acceptance rates at each position.
    ref_acceptance = _compute_acceptance(_run_model(float_kwargs, prompts, sampling_params))
    quarot_acceptance = _compute_acceptance(_run_model(quarot_kwargs, prompts, sampling_params))

    abs_diffs = [abs(ref - quarot) for ref, quarot in zip(ref_acceptance, quarot_acceptance)]

    assert all(diff <= TOL for diff in abs_diffs), (
        f"\nref_acceptance_per_pos: {[round(rate, 4) for rate in ref_acceptance]}"
        f"\nquarot_acceptance_per_pos: {[round(rate, 4) for rate in quarot_acceptance]}"
        f"\nabs_diff_per_pos: {[round(diff, 4) for diff in abs_diffs]} (tolerance: {TOL})"
    )