First commit

2025-08-05 19:02:46 +08:00
parent 9efe891f99
commit 99fb9f5cb0
1412 changed files with 203615 additions and 0 deletions
--- a/vllm/compilation/init.py
+++ b/vllm/compilation/init.py
--- a/vllm/compilation/pycache/init.cpython-310.pyc
+++ b/vllm/compilation/pycache/init.cpython-310.pyc
--- a/vllm/compilation/pycache/backends.cpython-310.pyc
+++ b/vllm/compilation/pycache/backends.cpython-310.pyc
--- a/vllm/compilation/pycache/compile_context.cpython-310.pyc
+++ b/vllm/compilation/pycache/compile_context.cpython-310.pyc
--- a/vllm/compilation/pycache/decorators.cpython-310.pyc
+++ b/vllm/compilation/pycache/decorators.cpython-310.pyc
--- a/vllm/compilation/pycache/levels.cpython-310.pyc
+++ b/vllm/compilation/pycache/levels.cpython-310.pyc
--- a/vllm/compilation/pycache/wrapper.cpython-310.pyc
+++ b/vllm/compilation/pycache/wrapper.cpython-310.pyc
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -0,0 +1,269 @@
+import copy
+import operator
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.fx as fx
+
+from vllm.logger import init_logger
+
+from .compile_context import get_compile_context
+from .levels import CompilationLevel
+
+logger = init_logger(__name__)
+
+
+def fix_functionalization(graph: fx.Graph):
+    """
+    Rewrite the graph module to replace the pattern involving
+    torch._higher_order_ops.auto_functionalize.auto_functionalized
+    with a direct call to the inplace custom op.
+
+    # TODO: check if PyTorch nightly has fixed this issue
+    """
+
+    # debug code, if we want to see the graph before the transformation
+    # with open("before.py", "w") as f:
+    #     print(graph.python_code(root_module="self", verbose=True).src, file=f)
+
+    nodes_to_remove = []
+
+    for node in graph.nodes:
+        # Identify the auto_functionalized node
+        if node.op == 'call_function' and node.target == torch._higher_order_ops.auto_functionalize.auto_functionalized:  # noqa
+            if node.args[0] == torch.ops._C.rotary_embedding.default:
+                # manual replace for rotary_embedding
+
+                # Now, collect the arguments
+                kwargs = node.kwargs
+
+                query = kwargs['query']
+                mm_node = query.args[0].args[0]
+
+                # Create a new call to torch.ops._C.rotary_embedding.default
+                with graph.inserting_before(node):
+                    # just insert the call to the custom op
+                    # NOTE: don't run dead code elimination,
+                    # otherwise this op will be removed
+                    graph.call_function(torch.ops._C.rotary_embedding.default,
+                                        kwargs=kwargs)
+
+                # Remove the auto_functionalized node
+                # Since the node may have outputs, we need to handle its users
+                # Replace uses of the outputs (getitem nodes) with mm_node
+                for user in list(node.users):
+                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
+                        # Remove the getitem node
+                        for getitem_user in list(user.users):
+                            if (getitem_user.op == 'call_function'
+                                    and getitem_user.target
+                                    == torch.ops.aten.slice_scatter.default):
+                                # Replace the uses of slice_scatter node
+                                # with mm_node
+                                getitem_user.replace_all_uses_with(mm_node)
+                                nodes_to_remove.append(getitem_user)
+                        nodes_to_remove.append(user)
+                nodes_to_remove.append(node)
+
+            elif node.args[0] == torch.ops._C.fused_add_rms_norm.default:
+                # manual replace for fused_add_rms_norm
+                # this is the most effective optimization for llama
+                # failing to do this will result in many unnecessary copies
+
+                kwargs = node.kwargs
+
+                input = kwargs['input']
+                residual = kwargs['residual']
+
+                # Create a new call to torch.ops._C.rotary_embedding.default
+                with graph.inserting_before(node):
+                    # just insert the call to the custom op
+                    # NOTE: don't run dead code elimination,
+                    # otherwise this op will be removed
+                    graph.call_function(
+                        torch.ops._C.fused_add_rms_norm.default, kwargs=kwargs)
+
+                for user in list(node.users):
+                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
+                        # Remove the getitem node
+                        if user.args[1] == 1:
+                            replace_node = input
+                        elif user.args[1] == 2:
+                            replace_node = residual
+                        user.replace_all_uses_with(replace_node)
+                        nodes_to_remove.append(user)
+                nodes_to_remove.append(node)
+
+            elif node.args[0] == torch.ops._C.rms_norm.default:
+                # manual replace for rms_norm
+
+                kwargs = node.kwargs
+
+                input = kwargs['input']
+                out = kwargs['out']
+                weight = kwargs['weight']
+                epsilon = kwargs['epsilon']
+                # Create a new call to torch.ops._C.rotary_embedding.default
+                # cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa
+                with graph.inserting_before(node):
+                    # just insert the call to the custom op
+                    # NOTE: don't run dead code elimination,
+                    # otherwise this op will be removed
+                    graph.call_function(
+                        torch.ops._C.rms_norm.default,
+                        args=(out, input, weight, epsilon),
+                    )
+
+                replace_node = out
+
+                for user in list(node.users):
+                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
+                        user.replace_all_uses_with(replace_node)
+                        nodes_to_remove.append(user)
+                nodes_to_remove.append(node)
+
+            elif node.args[0] == torch.ops._C.silu_and_mul.default:
+                # manual replace for silu_and_mul
+
+                kwargs = node.kwargs
+
+                input = kwargs['input']
+                out = kwargs['out']
+
+                # Create a new call to torch.ops._C.rotary_embedding.default
+                # cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa
+                with graph.inserting_before(node):
+                    # just insert the call to the custom op
+                    # NOTE: don't run dead code elimination,
+                    # otherwise this op will be removed
+                    graph.call_function(
+                        torch.ops._C.silu_and_mul.default,
+                        args=(out, input),
+                    )
+                replace_node = out
+
+                for user in list(node.users):
+                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
+                        user.replace_all_uses_with(replace_node)
+                        nodes_to_remove.append(user)
+                nodes_to_remove.append(node)
+
+    # Remove the nodes all at once
+    for node in nodes_to_remove:
+        graph.erase_node(node)
+
+    # debug code, if we want to see the graph after the transformation
+    # with open("after.py", "w") as f:
+    #     print(graph.python_code(root_module="self", verbose=True).src, file=f)
+
+
+def wrap_inductor(graph, example_inputs, additional_inductor_config):
+    from torch._inductor import config
+    current_config = config.shallow_copy_dict()
+    from torch._inductor.compile_fx import compile_fx
+
+    if additional_inductor_config is not None:
+        current_config.update(additional_inductor_config)
+    if current_config['post_grad_custom_post_pass'] is not None:
+        logger.warning(
+            "post_grad_custom_post_pass is already set in the config. "
+            "Overwriting it with the fix_functionalization")
+    current_config['post_grad_custom_post_pass'] = fix_functionalization
+    return compile_fx(graph, example_inputs, config_patches=current_config)
+
+
+def vllm_backend(
+        graph,
+        example_inputs,
+        additional_inductor_config: Optional[Dict] = None) -> Callable:
+
+    context = get_compile_context()
+    context = copy.deepcopy(context) if context is not None else []
+    sizes_to_specialize: List[int] = context
+
+    # flags for all the seen shapes, whether we need to specialize
+    runtime_shapes_to_compile_flags: Dict[Tuple[int, ...], bool] = {}
+
+    # if we need to specialize, the compiled graph for that shape
+    runtime_shapes_to_compiled_graph: Dict[Tuple[int, ...], Callable] = {}
+
+    # this is the first compilation, we will compile a graph with
+    # dynamic shape, as the caller will mark first dimension as dynamic
+    logger.info("Compiling a graph for general shapes")
+    graph_for_symbolic_shape = wrap_inductor(graph, example_inputs,
+                                             additional_inductor_config)
+
+    # TODO: Dynamo does not pass all dynamic shapes.
+    # Need to investigate why. It works now because all the dynamic
+    # shapes have the same value, and either of them can be used.
+    sym_shape_indices = [
+        i for i, x in enumerate(example_inputs) if isinstance(x, torch.SymInt)
+    ]
+
+    first_run = True
+
+    # this is the function we return to Dynamo to run finally
+    def compiled_graph_wrapper(*args):
+
+        runtime_shapes: Tuple[int,
+                              ...] = tuple(args[i] for i in sym_shape_indices)
+
+        nonlocal first_run
+        nonlocal runtime_shapes_to_compile_flags
+        nonlocal runtime_shapes_to_compiled_graph
+
+        if first_run:
+            # the first compilation is for profiling, we directly run it
+            first_run = False
+            return graph_for_symbolic_shape(*args)
+
+        if runtime_shapes not in runtime_shapes_to_compile_flags:
+            # we haven't seen this shape before
+            # query if we need to specialize for this shape
+            # we only specialize for the first dimension.
+            # TODO: investigate if any model needs to specialize
+            # beyond the first dimension
+            runtime_shapes_to_compile_flags[runtime_shapes] = runtime_shapes[
+                0] in sizes_to_specialize
+
+        if not runtime_shapes_to_compile_flags[runtime_shapes]:
+            # we don't need to specialize for this shape
+            return graph_for_symbolic_shape(*args)
+
+        if runtime_shapes not in runtime_shapes_to_compiled_graph:
+            # we need to specialize for this shape, and we haven't compiled
+            # compile the graph for this shape
+            logger.info("Compiling a graph for shapes %s", runtime_shapes)
+            runtime_shapes_to_compiled_graph[runtime_shapes] = wrap_inductor(
+                graph, args, additional_inductor_config)
+
+        return runtime_shapes_to_compiled_graph[runtime_shapes](*args)
+
+    return compiled_graph_wrapper
+
+
+def select_default_backend(level: int) -> Union[str, Callable]:
+    if level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]:
+        backend = "eager"
+        return backend
+    assert level in [
+        CompilationLevel.INDUCTOR, CompilationLevel.INDUCTOR_MAX_AUTOTUNE
+    ], f"Invalid level {level}"
+
+    from vllm.compilation.backends import vllm_backend
+    from vllm.plugins import get_inductor_additional_configs
+    additional_configs = get_inductor_additional_configs()
+
+    if level == CompilationLevel.INDUCTOR_MAX_AUTOTUNE:
+        if "max_autotune" in additional_configs and not additional_configs[
+                "max_autotune"]:
+            logger.warning(
+                "max_autotune is disabled, but is overridden by level %s",
+                CompilationLevel.INDUCTOR_MAX_AUTOTUNE)
+        additional_configs['max_autotune'] = True
+
+    from functools import partial
+    backend = partial(vllm_backend,
+                      additional_inductor_config=additional_configs)
+
+    return backend
--- a/vllm/compilation/compile_context.py
+++ b/vllm/compilation/compile_context.py
@@ -0,0 +1,23 @@
+from contextlib import contextmanager
+from typing import Any
+
+_compile_context: Any = None
+
+
+def get_compile_context() -> Any:
+    """Get the current compile context."""
+    return _compile_context
+
+
+@contextmanager
+def set_compile_context(context: Any):
+    """A context manager that stores the current compile context,
+    usually it is a list of sizes to specialize.
+    """
+    global _compile_context
+    prev_context = _compile_context
+    _compile_context = context
+    try:
+        yield
+    finally:
+        _compile_context = prev_context
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -0,0 +1,113 @@
+import inspect
+from typing import Dict, List, Union
+
+import torch
+
+import vllm.envs as envs
+from vllm.compilation.levels import CompilationLevel
+from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
+from vllm.sequence import IntermediateTensors
+from vllm.utils import supports_dynamo
+
+
+def support_torch_compile(dynamic_arg_dims: Dict[str, Union[int, List[int]]]):
+    """
+    A decorator to add support for compiling the forward method of a class.
+
+    `dynamic_arg_dims` is a dictionary that maps argument names to the dynamic
+    dimensions of the argument. The dynamic dimensions can be either a single
+    integer or a list of integers.
+
+    Depending on the value of arguments:
+
+    - if it is a single integer, the corresponding dimension of the argument
+        will be marked as dynamic.
+    - if it is `None`, ignored.
+    - if it is `IntermediateTensors`, all the tensors in the intermediate
+        tensors will be marked as dynamic.
+    - otherwise, it will raise an error.
+
+    NOTE: if an argument is `None`, it should always be passed as `None` during
+    the lifetime of the model, otherwise, it cannot be captured as a single
+    computation graph.
+    """
+
+    def cls_decorator_helper(cls: type):
+        # helper to pass `dynamic_arg_dims`` to `_support_torch_compile``
+        # to avoid too much indentation for `_support_torch_compile``
+        sig = inspect.signature(cls.forward)
+        for k in dynamic_arg_dims:
+            if k not in sig.parameters:
+                raise ValueError(
+                    f"Argument {k} not found in the forward method of {cls}")
+        return _support_torch_compile(cls, dynamic_arg_dims)
+
+    return cls_decorator_helper
+
+
+def _support_torch_compile(cls: type,
+                           dynamic_arg_dims: Dict[str, Union[int, List[int]]]):
+    """
+    A decorator to add support for compiling the forward method of a class.
+    """
+
+    # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner
+    # will handle the compilation, so we don't need to do anything here.
+    if envs.VLLM_TORCH_COMPILE_LEVEL in [
+            CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS
+    ] or not supports_dynamo():
+        return cls
+
+    # take care of method resolution order
+    # make sure super().__init__ is called on the base class
+    #  other than TorchCompileWrapperWithCustomDispatcher
+    cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, )
+
+    old_init = cls.__init__
+
+    def __init__(self, *args, **kwargs):
+        old_init(self, *args, **kwargs)
+        TorchCompileWrapperWithCustomDispatcher.__init__(self)
+
+    cls.__init__ = __init__
+
+    def __call__(self, *args, **kwargs):
+        # torch.compiler.is_compiling() means we are inside the compilation
+        # e.g. TPU has the compilation logic in model runner, so we don't
+        # need to compile the model inside.
+        if torch.compiler.is_compiling():
+            return self.forward(*args, **kwargs)
+
+        # the first compilation needs to have dynamic shapes marked
+        if len(self.compiled_codes) < 1:
+            sig = inspect.signature(self.__class__.forward)
+            bound_args = sig.bind(self, *args, **kwargs)
+            bound_args.apply_defaults()
+            for k, dims in dynamic_arg_dims.items():
+                arg = bound_args.arguments.get(k)
+                if arg is not None:
+                    if isinstance(arg, torch.Tensor):
+                        torch._dynamo.mark_dynamic(arg, dims)
+                    elif isinstance(arg, IntermediateTensors):
+                        for tensor in arg.tensors.values():
+                            torch._dynamo.mark_dynamic(tensor, dims)
+                    else:
+                        raise ValueError(
+                            "Unsupported dynamic dimensions"
+                            f" {dims} for argument {k} with type {type(arg)}.")
+
+        # if we don't use custom dispatcher, we can directly call the
+        # compiled function and let torch.compile handle the dispatching,
+        # with the overhead of guard evaluation and recompilation.
+        if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher:
+            return self.compiled_callable(*args, **kwargs)
+
+        # usually, capturing the model once is enough, and then we can
+        # dispatch to the compiled code directly, without going through
+        # the Dynamo guard mechanism.
+        with self.dispatch_to_code(0):
+            model_output = self.forward(*args, **kwargs)
+            return model_output
+
+    cls.__call__ = __call__
+    return cls
--- a/vllm/compilation/levels.py
+++ b/vllm/compilation/levels.py
@@ -0,0 +1,9 @@
+# constants for the levels of the compilation process
+
+
+class CompilationLevel:
+    NO_COMPILATION = 0
+    DYNAMO_AS_IS = 1
+    DYNAMO_ONCE = 2
+    INDUCTOR = 3
+    INDUCTOR_MAX_AUTOTUNE = 4
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -0,0 +1,102 @@
+import os
+import sys
+from abc import abstractmethod
+from contextlib import contextmanager
+from types import CodeType
+from typing import Callable, List, Optional
+
+import torch
+
+import vllm.envs as envs
+
+from .levels import CompilationLevel
+
+
+class TorchCompileWrapperWithCustomDispatcher:
+    """
+    A wrapper class for torch.compile, with a custom dispatch logic.
+    Subclasses should:
+    1. Implement the forward method
+    2. Implement the dispatch logic in the __call__ method
+        It can use `self.compiled_codes` to access the compiled bytecode,
+        and `with self.dispatch_to_code(index):` to dispatch to
+        the compiled code.
+    3. Implement the `__init__` method to determine how to call
+        `torch.compile` over the forward method.
+    """
+
+    def __init__(self, compiled_callable: Optional[Callable] = None):
+
+        if compiled_callable is None:
+            # default compilation settings
+            # compiling the forward method
+
+            # choose the compile backend
+
+            # if the user has set the backend, use it
+            from vllm.plugins import get_torch_compile_backend
+            backend = get_torch_compile_backend()
+            if backend is None:
+                from vllm.compilation.backends import select_default_backend
+                backend = select_default_backend(envs.VLLM_TORCH_COMPILE_LEVEL)
+
+            compiled_callable = torch.compile(
+                self.forward,
+                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                backend=backend)
+
+        self.compiled_callable = compiled_callable
+        self.original_code_object = self.__class__.forward.__code__
+        self.compiled_codes: List[CodeType] = []
+        torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook)
+
+        # read the env var to determine whether to use the custom dispatcher
+        # subclasses can use this to switch between the custom dispatcher
+        # and the default Dynamo guard mechanism.
+        self.use_custom_dispatcher: bool = \
+            envs.VLLM_TORCH_COMPILE_LEVEL >= CompilationLevel.DYNAMO_ONCE
+
+    def __call__(self, *args, **kwargs):
+        """Implement the dispatch logic here, beyond the torch.compile level.
+        NOTE: this function can have additional arguments beyond the forward
+         method, for directly dispatching to the compiled code.
+        """
+        return self.compiled_callable(*args, **kwargs)
+
+    @abstractmethod
+    def forward(self, *args, **kwargs):
+        ...
+
+    def bytecode_hook(self, old_code: CodeType, new_code: CodeType):
+        """Hook to save the compiled bytecode for direct execution."""
+        if old_code is not self.original_code_object:
+            return
+        # code borrowed from https://github.com/thuml/depyf/blob/f4ad79fadee27ea113b4c75202db1eb1a11c0dbc/depyf/explain/enable_debugging.py#L25
+        frame = sys._getframe()
+        while True:
+            frame = frame.f_back
+            code_name = frame.f_code.co_name
+            file_name = frame.f_code.co_filename.split(os.path.sep)[-1]
+            if code_name == "_compile" and file_name == "convert_frame.py":
+                break
+        frame = frame.f_locals["frame"]
+        assert frame.f_code == old_code
+
+        if frame.f_locals["self"] is not self:
+            return
+
+        self.compiled_codes.append(new_code)
+
+    @contextmanager
+    def dispatch_to_code(self, index: int):
+        """Context manager to dispatch to the compiled code.
+        Why does this work? Because Dynamo guarantees that the compiled
+        bytecode has exactly the same arguments, cell variables, and free
+        variables as the original code. Therefore we can directly switch
+        the code object in the function and call it.
+
+        See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details.
+        """ # noqa
+        self.__class__.forward.__code__ = self.compiled_codes[index]
+        yield
+        self.__class__.forward.__code__ = self.original_code_object