[Model] Support DeepSeek-V4

chenxb002
2026-04-24 09:50:34 +08:00
commit b9925203b8
172 changed files with 44780 additions and 0 deletions


@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project


@@ -0,0 +1,185 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
import operator
from typing import Dict, Iterable, List, Optional, Tuple, Union
import torch
from torch._higher_order_ops.auto_functionalize import auto_functionalized
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
from vllm.platforms import current_platform
from vllm.logger import init_logger
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.fx_utils import is_func
from vllm_mlu.mlu_hijack_utils import MluHijackObject
logger = init_logger(__name__)
class FixFunctionalizationPass_MluHijack(FixFunctionalizationPass):
@VllmInductorPass.time_and_log
def __call__(self, graph: torch.fx.Graph):
# XPU does not support auto-functionalization yet.
        # Will enable this when switching to vllm-xpu-kernels.
if current_platform.is_xpu():
logger.debug(
"XPU platform does not support fix functionalizationpass currently."
)
return
self.nodes_to_remove: list[torch.fx.Node] = []
count = 0
for node in graph.nodes:
'''
=============================
Modify by vllm_mlu
=============================
@brief: skip custom op on mlu
'''
if current_platform.is_out_of_tree():
continue # skip the count on mlu
'''
==================
End of MLU Hijack
==================
'''
if not is_func(node, auto_functionalized):
continue # Avoid deep if-elif nesting
kwargs = node.kwargs
at_target = node.args[0]
if at_target == torch.ops._C.rotary_embedding.default:
query = kwargs["query"]
key = kwargs["key"]
getitem_nodes = self.getitem_users(node)
if (
is_func(query, operator.getitem)
and is_func(key, operator.getitem)
and query.args[0] == key.args[0]
and is_func(query.args[0], torch.ops.aten.split_with_sizes.default)
and all(
is_func(user, torch.ops.aten.slice_scatter.default)
for getitem_node in getitem_nodes.values()
for user in getitem_node.users
)
):
# Pattern where query and key are slices of an mm_node.
# While functionalized, results at [1] and [2] are scattered
# back into mm_node. So after de-functionalization, we can
# just use mm_node directly.
mm_node = query.args[0].args[0]
for user in getitem_nodes.values():
for user_of_getitem in user.users:
if is_func(
user_of_getitem, torch.ops.aten.slice_scatter.default
):
user_of_getitem.replace_all_uses_with(mm_node)
self._remove(user_of_getitem)
self._remove(user)
self.insert_defunctionalized(graph, node)
self._remove(node)
else:
# Directly replace the auto_functionalize(rotary_embedding)
# with the inplace rotary_embedding. In theory, we shouldn't
# do this blindly, but in practice in vLLM it's ok. The best
# solution is to use auto_functionalization_v2 and then use
# inductor's builtin defunctionalization (reinplacing) pass.
mutated_args = {1: "query", 2: "key"}
self.defunctionalize(graph, node, mutated_args)
# rms_norm replacements avoid the most copies for LLaMa.
elif at_target == torch.ops._C.fused_add_rms_norm.default:
mutated_args = {1: "input", 2: "residual"}
self.defunctionalize(graph, node, mutated_args)
elif at_target == torch.ops._C.fused_add_rms_norm_static_fp8_quant.default: # noqa: E501
mutated_args = {1: "result", 2: "residual"}
self.defunctionalize(graph, node, mutated_args)
elif at_target == torch.ops._C.rms_norm_dynamic_per_token_quant.default: # noqa: E501
mutated_args = {1: "result", 2: "scale", 3: "residual"}
self.defunctionalize(graph, node, mutated_args)
elif at_target in [
torch.ops._C.rms_norm.default,
torch.ops._C.rms_norm_static_fp8_quant.default,
]:
mutated_args = {1: "result"}
self.defunctionalize(graph, node, mutated_args)
# For some reason we need to specify the args for both
# silu_and_mul and silu_and_mul_quant. The kwargs
# pathway gets the wrong answer.
elif at_target == torch.ops._C.silu_and_mul.default:
mutated_args = {1: "result"}
self.defunctionalize(
graph, node, mutated_args, args=("result", "input")
)
elif at_target == torch.ops._C.silu_and_mul_quant.default:
mutated_args = {1: "result"}
self.defunctionalize(
graph, node, mutated_args, args=("result", "input", "scale")
)
elif (
hasattr(torch.ops._C, "silu_and_mul_nvfp4_quant")
and at_target == torch.ops._C.silu_and_mul_nvfp4_quant.default
):
mutated_args = {1: "result", 2: "result_block_scale"}
self.defunctionalize(
graph,
node,
mutated_args,
args=(
"result",
"result_block_scale",
"input",
"input_global_scale",
),
)
# Defunctionalize fused_qk_norm_rope to remove higher-order wrapper.
elif at_target == torch.ops._C.fused_qk_norm_rope.default:
mutated_args = {1: "qkv"}
args = (
"qkv",
"num_heads_q",
"num_heads_k",
"num_heads_v",
"head_dim",
"eps",
"q_weight",
"k_weight",
"cos_sin_cache",
"is_neox",
"position_ids",
)
self.defunctionalize(graph, node, mutated_args=mutated_args, args=args)
else:
continue # skip the count
count += 1
self.dump_graph(graph, "before_cleanup")
# Remove the nodes all at once
count_removed = len(self.nodes_to_remove)
for node in self.nodes_to_remove:
graph.erase_node(node)
logger.debug(
"De-functionalized %s nodes, removed %s nodes", count, count_removed
)
self.nodes_to_remove.clear()
MluHijackObject.apply_hijack(
FixFunctionalizationPass,
FixFunctionalizationPass.__call__,
FixFunctionalizationPass_MluHijack.__call__
)
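
The apply_hijack call above rebinds FixFunctionalizationPass.__call__ to the MLU-aware implementation at import time. MluHijackObject itself is not part of this file; the snippet below is a minimal sketch of that class-level monkey-patching pattern, using a hypothetical HijackRegistry helper rather than the real vllm_mlu implementation.

# Minimal sketch of the method-level hijack pattern. HijackRegistry is a
# hypothetical stand-in; the real MluHijackObject.apply_hijack lives
# elsewhere in vllm_mlu and may differ in detail.
class HijackRegistry:
    _originals: dict = {}

    @classmethod
    def apply_hijack(cls, target_cls, orig_method, new_method):
        # Keep the original so the patch can be reverted later.
        cls._originals[(target_cls, orig_method.__name__)] = orig_method
        # Rebinding the attribute on the class makes every existing and
        # future instance dispatch to the replacement implementation.
        setattr(target_cls, orig_method.__name__, new_method)

    @classmethod
    def undo_hijack(cls, target_cls, method_name):
        original = cls._originals.pop((target_cls, method_name), None)
        if original is not None:
            setattr(target_cls, method_name, original)

# Usage mirrors the call above:
# HijackRegistry.apply_hijack(FixFunctionalizationPass,
#                             FixFunctionalizationPass.__call__,
#                             FixFunctionalizationPass_MluHijack.__call__)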


@@ -0,0 +1,242 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
import dataclasses
from collections.abc import Callable
from contextlib import ExitStack
from typing import Any
from unittest.mock import patch
import torch
from vllm.compilation.counter import compilation_counter
from vllm.compilation.monitor import validate_cudagraph_capturing_enabled
from vllm.config import CUDAGraphMode, VllmConfig
from vllm.distributed.device_communicators.pynccl_allocator import set_graph_pool_id
from vllm.forward_context import get_forward_context
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.torch_utils import weak_ref_tensors
from vllm.compilation.cuda_graph import (
CUDAGraphEntry,
CUDAGraphWrapper,
CUDAGraphOptions,
)
from vllm_mlu.v1.attention.backends.utils import MLUInferMode
logger = init_logger(__name__)
'''
=============================
Modify by vllm_mlu
=============================
@brief: specialized graph entry for prefill graphs
'''
@dataclasses.dataclass
class PrefillGraphEntry:
batch_size: int = 0
seq_len: int = 0
cudagraph: torch.mlu.MLUGraph | None = None
output: Any | None = None
# for cudagraph debugging, track the input addresses
# during capture, and check if they are the same during replay
input_addresses: list[int] | None = None
'''
==================
End of MLU Hijack
==================
'''
class MLUGraphWrapper(CUDAGraphWrapper):
def __init__(
self,
runnable: Callable,
vllm_config: VllmConfig,
runtime_mode: CUDAGraphMode,
cudagraph_options: CUDAGraphOptions | None = None,
):
super().__init__(runnable, vllm_config, runtime_mode, cudagraph_options)
'''
=============================
Modify by vllm_mlu
=============================
@brief: add separate dict for prefill graph entries
'''
self.prefill_mlugraph_entry: PrefillGraphEntry | None = None
'''
==================
End of MLU Hijack
==================
'''
'''
=============================
Modify by vllm_mlu
=============================
@brief: check if running in prefill mode
'''
    def is_running_in_prefill(self, entry: PrefillGraphEntry | None = None) -> bool:
        forward_context = get_forward_context()
        if forward_context.attn_metadata is None:
            return False
        common_metadata = forward_context.attn_metadata['common_metadata']
        return (
            entry is not None
            and common_metadata.infer_mode == MLUInferMode.PREFILL_ONLY
            and common_metadata.seq_lens_cpu.size(0) == entry.batch_size
            and (common_metadata.seq_lens_cpu == entry.seq_len).all().item()
        )
'''
==================
End of MLU Hijack
==================
'''
def __call__(
self,
is_capturing_prefill: bool = False,
prefill_enable_mlugraph: bool = False,
prefill_batch_size: int = 0,
prefill_seq_len: int = 0,
is_running_drafter: bool = False,
*args, **kwargs):
forward_context = get_forward_context()
batch_descriptor = forward_context.batch_descriptor
cudagraph_runtime_mode = forward_context.cudagraph_runtime_mode
if (
cudagraph_runtime_mode == CUDAGraphMode.NONE
or cudagraph_runtime_mode != self.runtime_mode
):
# CUDAGraphMode.NONE could mean the profile run, a warmup run, or
# running without cudagraphs.
            # We do not trigger capture/replay if the runtime mode does not
            # match. This enables proper dispatching to the correct
            # CUDAGraphWrapper when nesting multiple instances with different
            # runtime modes.
return self.runnable(*args, **kwargs)
'''
=============================
Modify by vllm_mlu
=============================
@brief: handle prefill graph separately
        @brief: skip cudagraph capture validation when running the drafter model
'''
if is_capturing_prefill: # PREFILL capture
self.prefill_mlugraph_entry = PrefillGraphEntry(
batch_size=prefill_batch_size,
seq_len=prefill_seq_len)
else: # FULL/DECODE capture
if batch_descriptor not in self.concrete_cudagraph_entries:
# create a new entry for this batch descriptor
self.concrete_cudagraph_entries[batch_descriptor] = CUDAGraphEntry(
batch_descriptor=batch_descriptor
)
if ((self.is_running_in_prefill(self.prefill_mlugraph_entry) and prefill_enable_mlugraph)
or is_capturing_prefill):
entry = self.prefill_mlugraph_entry
            logger.debug(
                "Hitting a prefill cudagraph on %s, batch_size: %s, seq_len: %s",
                self.runtime_mode.name, entry.batch_size, entry.seq_len)
else: # FULL/DECODE capture
entry = self.concrete_cudagraph_entries[batch_descriptor]
logger.debug(
"Hitting a decode cudagraph on (%s, %s)",
self.runtime_mode.name,
entry.batch_descriptor,
)
if entry.cudagraph is None:
if self.cudagraph_options.debug_log_enable:
# Since we capture cudagraph for many different shapes and
# capturing is fast, we don't need to log it for every
# shape. E.g. we only log it for the first subgraph in
# piecewise mode.
if is_capturing_prefill:
logger.debug(
"Capturing a prefill cudagraph on (%s, batch_size=%d, seq_len=%d)",
self.runtime_mode.name,
entry.batch_size,
entry.seq_len,
)
else:
logger.debug(
"Capturing a decode cudagraph on (%s, %s)",
self.runtime_mode.name,
entry.batch_descriptor,
)
if ((not is_capturing_prefill) and (not is_running_drafter)):
# validate that cudagraph capturing is legal at this point.
validate_cudagraph_capturing_enabled()
'''
==================
End of MLU Hijack
==================
'''
input_addresses = [
x.data_ptr() for x in args if isinstance(x, torch.Tensor)
]
entry.input_addresses = input_addresses
cudagraph = torch.mlu.MLUGraph()
with ExitStack() as stack:
if self.cudagraph_options.gc_disable:
# during every model forward for piecewise cudagraph
# mode, we will capture many pieces of cudagraphs
# (roughly one per layer). running gc again and again
# across layers will make the cudagraph capture very slow.
# therefore, we only run gc for the first graph,
# and disable gc for the rest of the graphs.
stack.enter_context(patch("gc.collect", lambda: None))
stack.enter_context(patch("torch.mlu.empty_cache", lambda: None))
if self.graph_pool is not None:
set_graph_pool_id(self.graph_pool)
else:
set_graph_pool_id(current_platform.graph_pool_handle())
# mind-exploding: carefully manage the reference and memory.
with torch.mlu.graph(cudagraph, pool=self.graph_pool):
# `output` is managed by pytorch's cudagraph pool
output = self.runnable(*args, **kwargs)
if self.cudagraph_options.weak_ref_output:
                        # By converting the output to a weak ref, the original
                        # `output` is released immediately to save memory.
                        # It is only safe to do this for the last graph in
                        # piecewise cudagraph mode, because the output of the
                        # last graph will not be used by any other cuda graph.
output = weak_ref_tensors(output)
# here we always use weak ref for the output
# to save memory
entry.output = weak_ref_tensors(output)
entry.cudagraph = cudagraph
compilation_counter.num_cudagraph_captured += 1
# important: we need to return the output, rather than
# the weak ref of the output, so that pytorch can correctly
# manage the memory during cuda graph capture
return output
if self.is_debugging_mode:
# check if the input addresses are the same
new_input_addresses = [
x.data_ptr() for x in args if isinstance(x, torch.Tensor)
]
assert new_input_addresses == entry.input_addresses, (
f"Input addresses for cudagraphs are different "
f"during replay. Expected {entry.input_addresses}, "
f"got {new_input_addresses}"
)
entry.cudagraph.replay()
return entry.output
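
The capture/replay flow in MLUGraphWrapper follows PyTorch's standard device-graph pattern: warm up, capture the runnable into a graph while recording the static input addresses, then replay after copying new data into those same buffers. The snippet below is a standalone sketch of that pattern against the stock torch.cuda.CUDAGraph API; the torch.mlu.MLUGraph / torch.mlu.graph calls used above are the Cambricon analogue and are assumed to behave the same way.

# Standalone sketch of graph capture/replay using the upstream torch.cuda
# API (requires a CUDA device); the MLU build substitutes the analogous
# torch.mlu interfaces, which is an assumption here, not shown in this diff.
import torch

def runnable(x: torch.Tensor) -> torch.Tensor:
    return torch.relu(x @ x)

static_input = torch.randn(64, 64, device="cuda")

# Warm up outside the graph so lazy initialization is not captured.
for _ in range(3):
    runnable(static_input)
torch.cuda.synchronize()

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    # Tensors produced during capture live in the graph's memory pool,
    # which is why the wrapper above keeps only weak references to them.
    static_output = runnable(static_input)

# Replay: copy fresh data into the captured input buffer (same address,
# mirroring the input_addresses check above), then replay the graph.
static_input.copy_(torch.randn(64, 64, device="cuda"))
graph.replay()
result = static_output.clone()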