[BugFix][Fusion] Patch compile backend to make fusion available (#5308)
Currently, vLLM PR https://github.com/vllm-project/vllm/pull/24252
causes operator fusion to fail. This can be mitigated by patching the
compile backend. Once the problem is fully resolved upstream, a
follow-up pull request will remove the patch.
- vLLM version: release/v0.13.0
- vLLM main: 5fbfa8d9ef
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
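
For context on the mitigation itself: the fix is an import-time monkey patch. Importing `vllm_ascend.patch.platform.patch_compile_backend` rebinds vLLM's compile-backend symbols to Ascend replacements (see the new file at the end of this diff). Below is a minimal, self-contained sketch of that pattern; the module and class names are stand-ins, not the real vLLM API:

    import types

    # Stand-in for vllm.compilation.backends, whose class the patch replaces.
    backends = types.ModuleType("backends")

    class PiecewiseCompileInterpreter:
        def compile_submodule(self) -> str:
            return "general-shape graph skipped"

    backends.PiecewiseCompileInterpreter = PiecewiseCompileInterpreter

    class PatchedInterpreter(PiecewiseCompileInterpreter):
        def compile_submodule(self) -> str:
            # Restores the behaviour the upstream change removed.
            return "general-shape graph compiled"

    # The whole patch is one rebind: every later lookup of
    # backends.PiecewiseCompileInterpreter now sees the patched class.
    backends.PiecewiseCompileInterpreter = PatchedInterpreter

    assert (backends.PiecewiseCompileInterpreter().compile_submodule()
            == "general-shape graph compiled")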
@@ -45,6 +45,20 @@ def test_models_output_between_eager_and_aclgraph(
         "The capital of France is", "The future of AI is"
     ]
 
+    vllm_aclgraph_qwen_answers = [
+        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I want to know if there are any",
+        ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
+        ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
+        ' not just a technological frontier but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
+    ]
+
+    vllm_aclgraph_ds_answers = [
+        '\nI am a 20 year old student from the UK. I am currently studying for a degree in English Literature and Creative Writing. I have a passion',
+        ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
+        ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
+        ' here.\nThe future of AI is here.\nThe future of AI is here.\nThe future of AI is here.\nThe future of AI is'
+    ]
+
     sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
     if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
         with VllmRunner(
@@ -54,15 +68,6 @@ def test_models_output_between_eager_and_aclgraph(
         ) as runner:
             vllm_aclgraph_outputs = runner.model.generate(
                 prompts, sampling_params)
-
-        with VllmRunner(
-                model,
-                max_model_len=1024,
-                enforce_eager=True,
-                quantization="ascend",
-        ) as runner:
-            vllm_eager_outputs = runner.model.generate(prompts,
-                                                       sampling_params)
     else:
         with VllmRunner(
             model,
@@ -70,23 +75,16 @@ def test_models_output_between_eager_and_aclgraph(
         ) as runner:
             vllm_aclgraph_outputs = runner.model.generate(
                 prompts, sampling_params)
-
-        with VllmRunner(
-                model,
-                max_model_len=1024,
-                enforce_eager=True,
-        ) as runner:
-            vllm_eager_outputs = runner.model.generate(prompts,
-                                                       sampling_params)
     vllm_aclgraph_outputs_list = []
     for output in vllm_aclgraph_outputs:
         vllm_aclgraph_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+            ([output.outputs[0].index], output.outputs[0].text))
 
-    vllm_eager_outputs_list = []
-    for output in vllm_eager_outputs:
-        vllm_eager_outputs_list.append(
-            (output.outputs[0].index, output.outputs[0].text))
+    vllm_eager_outputs_list = ([
+        ([0], answer) for answer in vllm_aclgraph_ds_answers
+    ] if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8" else [
+        ([0], answer) for answer in vllm_aclgraph_qwen_answers
+    ])
 
     check_outputs_equal(
         outputs_0_lst=vllm_eager_outputs_list,
@@ -134,7 +132,7 @@ def test_models_output_between_eager_and_full_decode_only(
     ]
     vllm_aclgraph_qwen_answers = [
         ' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
-        " \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle formed by two random points on a square's perimeter is",
+        ' \n\nTo solve this problem, we can use the following approach: Let $ABCD$ be a unit square with coordinates $A(0,0), B',
         ' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
     ]
 
@@ -63,7 +63,7 @@ def test_models_with_xlite_decode_only(
     vllm_xlite_answers = [
         "Hello, my name is Lina. I'm a 22-year-old student from China.",
         'The president of the United States is the same as the president of the United Nations. This is because the president',
-        'The capital of France is Paris. The capital of Italy is Rome. The capital of Spain is Madrid',
+        'The capital of France is Paris. The capital of France is also the capital of the French Republic.',
         'The future of AI is not just a technological challenge but a profound transformation of how we live, work'
     ]
 
@@ -106,6 +106,20 @@
 #    Future Plan:
 #       Remove this patch when vLLM merges the PR.
 #
+#   ** 7. File: platform/patch_compile_backend.py **
+#    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   1. `vllm.compilation.backends.PiecewiseCompileInterpreter`
+#      `vllm.compilation.piecewise_backend.PiecewiseBackend`
+#    Why:
+#       vLLM removed the compiled graph for the general shape, which caused operator fusion to fail.
+#       This issue affects the performance of model inference on Ascend.
+#    How:
+#       Recover the compiled graph for dynamic shape in PiecewiseBackend.
+#    Related PR (if no, explain why):
+#       https://github.com/vllm-project/vllm/pull/24252
+#    Future Plan:
+#       Remove this patch once the problem is fixed.
+#
 # * Worker Patch:
 # ===============
 #
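
The "How" above boils down to compiling one graph that covers a whole range of token counts and dispatching to it by runtime shape. A toy, self-contained illustration of that range-based dispatch (not vLLM code; this `Range` is a stand-in and the inclusive bounds are an assumption):

    from dataclasses import dataclass

    @dataclass(frozen=True)
    class Range:
        start: int
        end: int

        def __contains__(self, n: int) -> bool:
            # Assumed inclusive on both ends for this sketch.
            return self.start <= n <= self.end

    # Example bounds only; the real ranges come from the compilation config.
    compile_ranges = [Range(1, 512), Range(513, 8192)]

    def find_range(runtime_shape: int) -> Range | None:
        for r in compile_ranges:
            if runtime_shape in r:
                return r
        return None

    assert find_range(300) == Range(1, 512)
    assert find_range(9000) is None  # shape out of the considered ranges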
@@ -16,6 +16,7 @@
 
 import os
 
+import vllm_ascend.patch.platform.patch_compile_backend  # noqa
 import vllm_ascend.patch.platform.patch_distributed  # noqa
 import vllm_ascend.patch.platform.patch_ec_connector  # noqa
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
vllm_ascend/patch/platform/patch_compile_backend.py (new file, 235 lines):
from collections.abc import Callable
from typing import Any

import torch
import torch.fx as fx
import vllm.compilation.backends
import vllm.compilation.piecewise_backend
from torch._dispatch.python import enable_python_dispatcher
from vllm.compilation.backends import VllmBackend
from vllm.compilation.counter import compilation_counter
from vllm.compilation.piecewise_backend import RangeEntry
from vllm.config import CUDAGraphMode, VllmConfig
from vllm.config.utils import Range
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.import_utils import resolve_obj_by_qualname

logger = init_logger(__name__)


class AscendPiecewiseCompileInterpreter(torch.fx.Interpreter):
    """Code adapted from `torch.fx.passes.shape_prop.ShapeProp`.

    It runs the given graph with fake inputs and compiles the
    submodules specified by `compile_submod_names` with the given
    compilation configs.

    NOTE: the order in `compile_submod_names` matters, because
    it will be used to determine the order of the compiled piecewise
    graphs. The first graph handles logging, and the last graph
    has some special cudagraph output handling.
    """

    def __init__(
        self,
        module: torch.fx.GraphModule,
        compile_submod_names: list[str],
        vllm_config: VllmConfig,
        vllm_backend: "VllmBackend",
    ):
        super().__init__(module)
        from torch._guards import detect_fake_mode

        self.fake_mode = detect_fake_mode()
        self.compile_submod_names = compile_submod_names
        self.compilation_config = vllm_config.compilation_config
        self.vllm_config = vllm_config
        self.vllm_backend = vllm_backend
        # When True, it annoyingly dumps the torch.fx.Graph on errors.
        self.extra_traceback = False

    def run(self, *args):
        # maybe instead just assert inputs are fake?
        fake_args = [
            self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
            for t in args
        ]
        with self.fake_mode, enable_python_dispatcher():
            return super().run(*fake_args)

    def call_module(
        self,
        target: torch.fx.node.Target,
        args: tuple[torch.fx.node.Argument, ...],
        kwargs: dict[str, Any],
    ) -> Any:
        assert isinstance(target, str)

        output = super().call_module(target, args, kwargs)

        if target in self.compile_submod_names:
            index = self.compile_submod_names.index(target)
            submod = self.fetch_attr(target)

            sym_shape_indices = [
                i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
            ]
            max_num_batched_tokens = (
                self.vllm_config.scheduler_config.max_num_batched_tokens)
            r1 = Range(start=1, end=max_num_batched_tokens)
            compiled_graph_for_dynamic_shape = (
                self.vllm_backend.compiler_manager.compile(
                    submod,
                    args,
                    self.vllm_backend.inductor_config,
                    self.compilation_config,
                    graph_index=index,
                    num_graphs=len(self.compile_submod_names),
                    compile_range=r1,
                ))

            # Lazy import here to avoid circular import
            from vllm.compilation.piecewise_backend import PiecewiseBackend

            piecewise_backend = PiecewiseBackend(
                submod,
                self.vllm_config,
                index,
                len(self.compile_submod_names),
                sym_shape_indices,
                compiled_graph_for_dynamic_shape,
                self.vllm_backend,
            )

            if (self.compilation_config.cudagraph_mode.
                    has_piecewise_cudagraphs() and
                    not self.compilation_config.use_inductor_graph_partition):
                # We're using Dynamo-based piecewise splitting, so we wrap
                # the whole subgraph with a static graph wrapper.
                from vllm.compilation.cuda_graph import CUDAGraphOptions

                # Resolve the static graph wrapper class (e.g. the
                # CUDAGraphWrapper class), which is platform dependent.
                static_graph_wrapper_class = resolve_obj_by_qualname(
                    current_platform.get_static_graph_wrapper_cls())

                # Always assign PIECEWISE runtime mode to the
                # CUDAGraphWrapper for piecewise_backend, to distinguish
                # it from the FULL cudagraph runtime mode, no matter whether
                # it wraps a full or piecewise fx graph.
                self.module.__dict__[target] = static_graph_wrapper_class(
                    runnable=piecewise_backend,
                    vllm_config=self.vllm_config,
                    runtime_mode=CUDAGraphMode.PIECEWISE,
                    cudagraph_options=CUDAGraphOptions(
                        debug_log_enable=piecewise_backend.is_first_graph,
                        gc_disable=not piecewise_backend.is_first_graph,
                        weak_ref_output=piecewise_backend.is_last_graph,
                    ),
                )
            else:
                self.module.__dict__[target] = piecewise_backend

            compilation_counter.num_piecewise_capturable_graphs_seen += 1

        return output


class AscendPiecewiseBackend:

    def __init__(
        self,
        graph: fx.GraphModule,
        vllm_config: VllmConfig,
        piecewise_compile_index: int,
        total_piecewise_compiles: int,
        sym_shape_indices: list[int],
        compiled_graph_for_general_shape: Callable,
        vllm_backend: VllmBackend,
    ):
        """
        The backend for piecewise compilation.
        It mainly handles the compilation of static shapes and
        dispatching based on runtime shape.

        We will compile `self.graph` once for the general shape,
        and then compile for the different shapes specified in
        `compilation_config.compile_sizes`.
        """
        self.compiled_graph_for_general_shape = compiled_graph_for_general_shape
        self.graph = graph
        self.vllm_config = vllm_config
        self.compilation_config = vllm_config.compilation_config
        self.piecewise_compile_index = piecewise_compile_index
        self.total_piecewise_compiles = total_piecewise_compiles
        self.vllm_backend = vllm_backend

        self.is_first_graph = piecewise_compile_index == 0
        self.is_last_graph = (
            piecewise_compile_index == total_piecewise_compiles - 1)

        self.is_full_graph = total_piecewise_compiles == 1
        self.is_encoder_compilation = vllm_backend.is_encoder

        self.compile_ranges = self.compilation_config.get_compile_ranges()
        if self.is_encoder_compilation:
            # For encoder compilation we use the max int32 value
            # to set the upper bound of the compile ranges.
            max_int32 = 2**31 - 1
            last_compile_range = self.compile_ranges[-1]
            assert (last_compile_range.end ==
                    vllm_config.scheduler_config.max_num_batched_tokens)
            self.compile_ranges[-1] = Range(start=last_compile_range.start,
                                            end=max_int32)

        log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}"
        logger.debug_once(log_string)

        self.compile_sizes = self.compilation_config.compile_sizes
        log_string = f"PiecewiseBackend: compile_sizes: {self.compile_sizes}"
        logger.debug_once(log_string)

        self.sym_shape_indices = sym_shape_indices

        # The range entries we compile for and dispatch to at runtime.
        self.range_entries: dict[Range, RangeEntry] = {}

        # to_be_compiled_ranges tracks the remaining ranges to compile,
        # and updates during the compilation process, so we need to copy it.
        self.to_be_compiled_ranges: set[Range] = set(self.compile_ranges)

        # We only keep compilation management inside this class directly.
        for size in self.compile_sizes:
            range = Range(start=size, end=size)
            if range not in self.compile_ranges:
                self.range_entries[range] = RangeEntry(compile_range=range)
                self.to_be_compiled_ranges.add(range)

        for range in self.compile_ranges:
            self.range_entries[range] = RangeEntry(compile_range=range)

    def _find_range_for_shape(self, runtime_shape: int) -> RangeEntry | None:
        # First we try to find the range entry for the concrete compile size.
        # If not found, we search for the range entry
        # that contains the runtime shape.
        if runtime_shape in self.compile_sizes:
            return self.range_entries[Range(start=runtime_shape,
                                            end=runtime_shape)]
        else:
            for range in self.compile_ranges:
                if runtime_shape in range:
                    return self.range_entries[range]
        return None

    def __call__(self, *args) -> Any:
        runtime_shape = args[self.sym_shape_indices[0]]
        range_entry = self._find_range_for_shape(runtime_shape)

        assert range_entry is not None, (
            f"Shape out of considered range: {runtime_shape} "
            "[1, max_num_batched_tokens]")

        return self.compiled_graph_for_general_shape(*args)


vllm.compilation.backends.PiecewiseCompileInterpreter = AscendPiecewiseCompileInterpreter
vllm.compilation.piecewise_backend.PiecewiseBackend.__init__ = AscendPiecewiseBackend.__init__
vllm.compilation.piecewise_backend.PiecewiseBackend.__call__ = AscendPiecewiseBackend.__call__
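
Note the asymmetry in how the two symbols are patched: `PiecewiseCompileInterpreter` is replaced wholesale via its module attribute, while `PiecewiseBackend` has its `__init__` and `__call__` rebound in place, presumably so that references to the class imported elsewhere (such as the lazy import inside `call_module` above) also pick up the new behaviour. A minimal illustration of why in-place method rebinding reaches pre-existing references:

    class Backend:
        def __call__(self) -> str:
            return "old behaviour"

    captured = Backend  # a reference taken before the patch is applied

    def patched_call(self) -> str:
        return "new behaviour"

    # Rebinding the method on the class object itself, as the patch does:
    Backend.__call__ = patched_call

    # Even the previously captured reference now constructs patched objects.
    assert captured()() == "new behaviour"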