Upgrade to vLLM 0.17.0 (Corex v4.1 overlay)
@@ -282,61 +282,13 @@ class CompilerManager:
         maybe_key += f"{compile_range.start}_{compile_range.end}"
         maybe_key += f"_subgraph_{graph_index}"
         with self.compile_context(compile_range):
-            # There is a compilation time optimization here.
-            #
-            # If the (input metadata, graph, compiler config) are the same, then
-            # we want to avoid compiling the same artifact again. If we didn't
-            # do this optimization, the backend compilation (InductorAdaptor or
-            # InductorStandaloneAdaptor)
-            # is able to cache hit and produce an artifact faster if it was
-            # already created, but it is still a duplicate artifact that
-            # requires unnecessary things e.g. disk IO.
-            #
-            # The optimization is: If the backend compilation cache hits,
-            # then do an early return from the backend compilation and look up
-            # which of the previous in-memory artifacts we created to reuse.
-            #
-            # We implemented this by monkey-patching torch (torch does not
-            # easily expose the cache_key function), but in the future torch
-            # should expose the cache_key function that we can just call
-            # directly before invoking backend compilation.
-            cache_key = None
-            orig = torch._functorch._aot_autograd.autograd_cache.autograd_cache_key
-
-            def autograd_cache_key(*args, **kwargs):
-                result = orig(*args, **kwargs)
-                if result is None:
-                    return None
-                nonlocal cache_key
-                cache_key = result[0]
-                if cache_key in self.loaded_artifacts:
-                    raise StopCompiling()
-                return result
-
-            from unittest.mock import patch
-
-            with (
-                # Graphs that are isometric (different node names but same
-                # structure) should be treated as the same.
-                torch._functorch.config.patch(autograd_cache_normalize_inputs=True),
-                patch(
-                    "torch._functorch._aot_autograd.autograd_cache.autograd_cache_key",
-                    autograd_cache_key,
-                ),
-            ):
-                try:
-                    compiled_graph, handle = self.compiler.compile(
-                        graph,
-                        example_inputs,
-                        additional_inductor_config,
-                        compile_range,
-                        maybe_key,
-                    )
-                except StopCompiling:
-                    assert cache_key is not None
-                    return self.loaded_artifacts[cache_key]
-            if cache_key is not None and compiled_graph is not None:
-                self.loaded_artifacts[cache_key] = compiled_graph
+            compiled_graph, handle = self.compiler.compile(
+                graph,
+                example_inputs,
+                additional_inductor_config,
+                compile_range,
+                maybe_key,
+            )

             assert compiled_graph is not None, "Failed to compile the graph"

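The block removed above is worth understanding before porting the overlay forward: it short-circuits duplicate compilation by patching the cache-key function so that a cache hit raises a sentinel exception. A minimal sketch of the same pattern, with hypothetical stand-ins (`cache_key_fn`, `expensive_compile`, `loaded_artifacts`) in place of torch's and vLLM's real internals; only `unittest.mock.patch` is the real API the removed code used:

```python
# Sketch of the short-circuit caching pattern; names are illustrative.
from unittest.mock import patch


class StopCompiling(Exception):
    """Sentinel: an in-memory artifact already exists for this key."""


def cache_key_fn(graph_repr: str) -> str:
    # Stand-in for torch's cache-key computation.
    return f"key-{graph_repr}"


def expensive_compile(graph_repr: str) -> str:
    # Stand-in for backend compilation; consults the (patched) key fn.
    return f"artifact-for-{cache_key_fn(graph_repr)}"


loaded_artifacts: dict[str, str] = {}


def compile_once(graph_repr: str) -> str:
    captured = None
    orig = cache_key_fn

    def patched(*args, **kwargs):
        nonlocal captured
        captured = orig(*args, **kwargs)
        if captured in loaded_artifacts:
            raise StopCompiling()  # skip the duplicate compilation entirely
        return captured

    with patch(f"{__name__}.cache_key_fn", patched):
        try:
            artifact = expensive_compile(graph_repr)
        except StopCompiling:
            return loaded_artifacts[captured]  # reuse the in-memory artifact
    loaded_artifacts[captured] = artifact
    return artifact


assert compile_once("g") == compile_once("g")  # second call is a cache hit
```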
@@ -497,7 +449,7 @@ def wrap_with_cudagraph_if_needed(
         # it from the FULL cudagraph runtime mode, no matter it
         # is wrapped on a full or piecewise fx graph.
         return static_graph_wrapper_class(
-            runnable=piecewise_backend,
+            runnable=piecewise_backend.graph.forward,
             vllm_config=vllm_config,
             runtime_mode=CUDAGraphMode.PIECEWISE,
             cudagraph_options=CUDAGraphOptions(
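The overlay points the wrapper at the underlying GraphModule's eager `forward` rather than at the piecewise backend object, bypassing the backend's own `__call__` dispatch at replay time. The wrapper only needs *some* callable to capture and replay, as this hypothetical sketch shows (vLLM's real wrapper additionally performs CUDA graph capture):

```python
# StaticGraphWrapper is illustrative only; it shows why any callable,
# including backend.graph.forward, satisfies the `runnable` parameter.
from typing import Any, Callable


class StaticGraphWrapper:
    def __init__(self, runnable: Callable[..., Any]):
        self.runnable = runnable
        self.captured = False

    def __call__(self, *args: Any) -> Any:
        if not self.captured:
            # Real implementation: capture a CUDA graph on the first call.
            self.captured = True
        return self.runnable(*args)  # replay goes straight to the callable
```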
@@ -780,7 +732,7 @@ class VllmBackend:
         return standalone_compile_artifacts, sym_shape_indices_map, returns_tuple_map

     def configure_post_pass(self) -> None:
-        # self.pass_manager.configure(self.vllm_config)
+        self.pass_manager.configure(self.vllm_config)

         # Post-grad custom passes are run using the post_grad_custom_post_pass
         # hook. If a pass for that hook exists, add it to the pass manager.
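The context comment describes folding any user-installed post-grad pass into the managed pipeline before taking over the hook. A hedged sketch of that handoff, with a hypothetical `PassManager`; `post_grad_custom_post_pass` is the Inductor config hook the comment names:

```python
# Illustrative only: how a pass manager can absorb an existing hook pass
# and then install itself on the post_grad_custom_post_pass hook.
from typing import Any, Callable


class PassManager:
    def __init__(self) -> None:
        self.passes: list[Callable[[Any], None]] = []

    def add(self, p: Callable[[Any], None]) -> None:
        self.passes.append(p)

    def __call__(self, graph: Any) -> None:
        for p in self.passes:
            p(graph)


def configure_post_pass(pass_manager: PassManager, inductor_config: dict) -> None:
    # Keep a user's existing pass by folding it into the pipeline,
    # then run the whole pipeline via the hook.
    existing = inductor_config.get("post_grad_custom_post_pass")
    if existing is not None:
        pass_manager.add(existing)
    inductor_config["post_grad_custom_post_pass"] = pass_manager
```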
@@ -846,7 +798,7 @@ class VllmBackend:
             ),
         )

-    def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
+    def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any], **kwargs) -> Any:
         from .caching import (
             VllmSerializableFunction,
         )
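Adding `**kwargs` keeps the backend tolerant of extra keyword arguments that a newer torch may forward to compile backends. A torch.compile backend is just a callable taking a GraphModule and example inputs and returning a callable, so a minimal sketch looks like this (`ToyBackend` is illustrative; the backend protocol itself is torch's real API):

```python
# Sketch of the forward-compatible backend signature.
from typing import Any, Callable, Sequence

import torch
from torch import fx


class ToyBackend:
    def __call__(
        self,
        graph: fx.GraphModule,
        example_inputs: Sequence[Any],
        **kwargs: Any,  # tolerate extra args newer torch versions may pass
    ) -> Callable[..., Any]:
        return graph.forward  # run the captured graph eagerly


fn = torch.compile(lambda x: x * 2, backend=ToyBackend())
print(fn(torch.ones(2)))  # tensor([2., 2.])
```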
@@ -988,7 +940,7 @@ class VllmBackend:
         assert not self._called, "VllmBackend can only be called once"

         self.graph = graph
-        self.configure_post_pass()
+        # self.configure_post_pass()

         if self.compilation_config.use_inductor_graph_partition:
             # Let Inductor decide partitioning; avoid FX-level pre-splitting.
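For reference, the flag this branch tests is user-configurable. A hedged sketch of toggling it; `CompilationConfig` and `use_inductor_graph_partition` exist in recent vLLM, but the exact import path and defaults may differ across versions:

```python
# Assumption: vllm.config.CompilationConfig exposes this field directly.
from vllm.config import CompilationConfig

config = CompilationConfig(use_inductor_graph_partition=True)
```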