Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -282,61 +282,13 @@ class CompilerManager:
maybe_key += f"{compile_range.start}_{compile_range.end}"
maybe_key += f"_subgraph_{graph_index}"
with self.compile_context(compile_range):
# There is a compilation time optimization here.
#
# If the (input metadata, graph, compiler config) are the same, then
# we want to avoid compiling the same artifact again. If we didn't
# do this optimization, the backend compilation (InductorAdaptor or
# InductorStandaloneAdaptor)
# is able to cache hit and produce an artifact faster if it was
# already created, but it is still a duplicate artifact that
# requires unnecessary things e.g. disk IO.
#
# The optimization is: If the backend compilation cache hits,
# then do an early return from the backend compilation and look up
# which of the previous in-memory artifacts we created to reuse.
#
# We implemented this by monkey-patching torch (torch does not
# easily expose the cache_key function), but in the future torch
# should expose the cache_key function that we can just call
# directly before invoking backend compilation.
cache_key = None
orig = torch._functorch._aot_autograd.autograd_cache.autograd_cache_key
def autograd_cache_key(*args, **kwargs):
result = orig(*args, **kwargs)
if result is None:
return None
nonlocal cache_key
cache_key = result[0]
if cache_key in self.loaded_artifacts:
raise StopCompiling()
return result
from unittest.mock import patch
with (
# Graphs that are isometric (different node names but same
# structure) should be treated as the same.
torch._functorch.config.patch(autograd_cache_normalize_inputs=True),
patch(
"torch._functorch._aot_autograd.autograd_cache.autograd_cache_key",
autograd_cache_key,
),
):
try:
compiled_graph, handle = self.compiler.compile(
graph,
example_inputs,
additional_inductor_config,
compile_range,
maybe_key,
)
except StopCompiling:
assert cache_key is not None
return self.loaded_artifacts[cache_key]
if cache_key is not None and compiled_graph is not None:
self.loaded_artifacts[cache_key] = compiled_graph
compiled_graph, handle = self.compiler.compile(
graph,
example_inputs,
additional_inductor_config,
compile_range,
maybe_key,
)
assert compiled_graph is not None, "Failed to compile the graph"
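
Note on the removed block: it implemented a duplicate-compilation short-circuit by monkey-patching torch's autograd cache-key function so that a cache hit raises a sentinel exception, which is caught in order to return a previously created in-memory artifact. A minimal, self-contained sketch of that pattern follows; `StopCompiling`, `compute_cache_key`, and `run_backend` are illustrative stand-ins, not vLLM or torch APIs.

```python
from unittest.mock import patch


class StopCompiling(Exception):
    """Sentinel raised by the patched key function on a cache hit."""


loaded_artifacts: dict[str, str] = {}


def compute_cache_key(graph_src: str) -> str:
    # Stand-in for torch's internal autograd_cache_key.
    return f"key::{graph_src}"


def run_backend(graph_src: str) -> str:
    # Stand-in for the real backend compilation; it computes the key
    # internally, which is what makes the patch below effective.
    compute_cache_key(graph_src)
    return f"compiled::{graph_src}"


def compile_once(graph_src: str) -> str:
    cache_key = None
    orig = compute_cache_key  # keep a handle on the unpatched function

    def patched_key(*args, **kwargs):
        nonlocal cache_key
        cache_key = orig(*args, **kwargs)
        if cache_key in loaded_artifacts:
            raise StopCompiling()  # abort the redundant backend compile early
        return cache_key

    with patch(f"{__name__}.compute_cache_key", patched_key):
        try:
            artifact = run_backend(graph_src)
        except StopCompiling:
            assert cache_key is not None
            return loaded_artifacts[cache_key]  # reuse the in-memory artifact
    if cache_key is not None:
        loaded_artifacts[cache_key] = artifact
    return artifact


assert compile_once("g") == compile_once("g")  # second call short-circuits
```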
@@ -497,7 +449,7 @@ def wrap_with_cudagraph_if_needed(
         # it from the FULL cudagraph runtime mode, no matter whether it
         # is wrapped around a full or piecewise fx graph.
         return static_graph_wrapper_class(
-            runnable=piecewise_backend,
+            runnable=piecewise_backend.graph.forward,
             vllm_config=vllm_config,
             runtime_mode=CUDAGraphMode.PIECEWISE,
             cudagraph_options=CUDAGraphOptions(
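
The wrapper now receives the piecewise backend's underlying `graph.forward` rather than the backend object itself. A hedged sketch of the wrapper shape this assumes (a simplified stand-in, not the real CUDAGraphWrapper): all it requires is that `runnable` be a plain callable taking the graph's inputs.

```python
from typing import Any, Callable


class StaticGraphWrapperSketch:
    """Illustrative stand-in: capture once, then replay a static graph."""

    def __init__(self, runnable: Callable[..., Any]):
        self.runnable = runnable
        self.captured = False

    def __call__(self, *args: Any) -> Any:
        if not self.captured:
            # A real wrapper would record a CUDA graph here on first use
            # and replay the recorded graph on subsequent calls.
            self.captured = True
        return self.runnable(*args)
```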
@@ -780,7 +732,7 @@ class VllmBackend:
         return standalone_compile_artifacts, sym_shape_indices_map, returns_tuple_map

     def configure_post_pass(self) -> None:
-        # self.pass_manager.configure(self.vllm_config)
+        self.pass_manager.configure(self.vllm_config)

         # Post-grad custom passes are run using the post_grad_custom_post_pass
         # hook. If a pass for that hook exists, add it to the pass manager.
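
The hook named in this comment is `torch._inductor.config.post_grad_custom_post_pass`. A hedged sketch of attaching a pass through it on recent PyTorch, where custom passes implement `CustomGraphPass` so Inductor's FX graph cache can key on them; the class name and pass body here are illustrative:

```python
from typing import Any, Optional

import torch
from torch._inductor.custom_graph_pass import CustomGraphPass, get_hash_for_files


class NoopPostGradPass(CustomGraphPass):
    """Illustrative pass that runs after Inductor's own post-grad passes."""

    def __call__(self, graph: torch.fx.Graph) -> None:
        # Mutate the graph in place, e.g. pattern-match and rewrite nodes.
        for node in graph.nodes:
            pass

    def uuid(self) -> Optional[Any]:
        # Identifies this pass implementation for Inductor's caching.
        return get_hash_for_files((__file__,))


torch._inductor.config.post_grad_custom_post_pass = NoopPostGradPass()
```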
@@ -846,7 +798,7 @@ class VllmBackend:
             ),
         )

-    def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
+    def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any], **kwargs) -> Any:
         from .caching import (
             VllmSerializableFunction,
         )
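
Adding `**kwargs` keeps the backend entry point tolerant of extra keyword arguments a newer torch may pass when invoking it. The same pattern in a minimal standalone `torch.compile` backend (`tolerant_backend` and `f` are illustrative names):

```python
from typing import Any, Sequence

import torch
from torch import fx


def tolerant_backend(
    graph: fx.GraphModule, example_inputs: Sequence[Any], **kwargs: Any
) -> Any:
    # Accept and ignore options we do not understand; returning
    # graph.forward simply runs the captured graph eagerly.
    return graph.forward


@torch.compile(backend=tolerant_backend)
def f(x: torch.Tensor) -> torch.Tensor:
    return torch.relu(x) + 1


f(torch.randn(4))  # triggers compilation through tolerant_backend
```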
@@ -988,7 +940,7 @@ class VllmBackend:
         assert not self._called, "VllmBackend can only be called once"

         self.graph = graph
-        self.configure_post_pass()
+        # self.configure_post_pass()

         if self.compilation_config.use_inductor_graph_partition:
             # Let Inductor decide partitioning; avoid FX-level pre-splitting.
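
For reference, a hedged sketch of delegating partitioning to Inductor outside vLLM, assuming a PyTorch build that exposes the `torch._inductor.config.graph_partition` flag (an assumption; older builds lack it):

```python
import torch

# Assumption: recent PyTorch exposes this flag. Instead of pre-splitting
# the FX graph before compilation, Inductor carves out the partitions
# itself (e.g. so cudagraph-unsafe ops land outside captured regions).
torch._inductor.config.graph_partition = True


@torch.compile(mode="reduce-overhead")
def f(x: torch.Tensor) -> torch.Tensor:
    return (x @ x.T).relu()
```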