Upgrade to vLLM 0.17.0 (Corex v4.1 overlay)
@@ -282,61 +282,13 @@ class CompilerManager:
         maybe_key += f"{compile_range.start}_{compile_range.end}"
         maybe_key += f"_subgraph_{graph_index}"
         with self.compile_context(compile_range):
-            # There is a compilation time optimization here.
-            #
-            # If the (input metadata, graph, compiler config) are the same, then
-            # we want to avoid compiling the same artifact again. If we didn't
-            # do this optimization, the backend compilation (InductorAdaptor or
-            # InductorStandaloneAdaptor)
-            # is able to cache hit and produce an artifact faster if it was
-            # already created, but it is still a duplicate artifact that
-            # requires unnecessary things e.g. disk IO.
-            #
-            # The optimization is: If the backend compilation cache hits,
-            # then do an early return from the backend compilation and look up
-            # which of the previous in-memory artifacts we created to reuse.
-            #
-            # We implemented this by monkey-patching torch (torch does not
-            # easily expose the cache_key function), but in the future torch
-            # should expose the cache_key function that we can just call
-            # directly before invoking backend compilation.
-            cache_key = None
-            orig = torch._functorch._aot_autograd.autograd_cache.autograd_cache_key
-
-            def autograd_cache_key(*args, **kwargs):
-                result = orig(*args, **kwargs)
-                if result is None:
-                    return None
-                nonlocal cache_key
-                cache_key = result[0]
-                if cache_key in self.loaded_artifacts:
-                    raise StopCompiling()
-                return result
-
-            from unittest.mock import patch
-
-            with (
-                # Graphs that are isometric (different node names but same
-                # structure) should be treated as the same.
-                torch._functorch.config.patch(autograd_cache_normalize_inputs=True),
-                patch(
-                    "torch._functorch._aot_autograd.autograd_cache.autograd_cache_key",
-                    autograd_cache_key,
-                ),
-            ):
-                try:
-                    compiled_graph, handle = self.compiler.compile(
-                        graph,
-                        example_inputs,
-                        additional_inductor_config,
-                        compile_range,
-                        maybe_key,
-                    )
-                except StopCompiling:
-                    assert cache_key is not None
-                    return self.loaded_artifacts[cache_key]
-            if cache_key is not None and compiled_graph is not None:
-                self.loaded_artifacts[cache_key] = compiled_graph
+            compiled_graph, handle = self.compiler.compile(
+                graph,
+                example_inputs,
+                additional_inductor_config,
+                compile_range,
+                maybe_key,
+            )

             assert compiled_graph is not None, "Failed to compile the graph"

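The block removed above is worth understanding before porting the overlay forward: it short-circuits duplicate compilation by patching the cache-key function so that a cache hit raises a sentinel exception. A minimal sketch of the same pattern, with hypothetical stand-ins (`cache_key_fn`, `expensive_compile`, `loaded_artifacts`) in place of torch's and vLLM's real internals; only `unittest.mock.patch` is the real API the removed code used:

```python
# Sketch of the short-circuit caching pattern; names are illustrative.
from unittest.mock import patch


class StopCompiling(Exception):
    """Sentinel: an in-memory artifact already exists for this key."""


def cache_key_fn(graph_repr: str) -> str:
    # Stand-in for torch's cache-key computation.
    return f"key-{graph_repr}"


def expensive_compile(graph_repr: str) -> str:
    # Stand-in for backend compilation; consults the (patched) key fn.
    return f"artifact-for-{cache_key_fn(graph_repr)}"


loaded_artifacts: dict[str, str] = {}


def compile_once(graph_repr: str) -> str:
    captured = None
    orig = cache_key_fn

    def patched(*args, **kwargs):
        nonlocal captured
        captured = orig(*args, **kwargs)
        if captured in loaded_artifacts:
            raise StopCompiling()  # skip the duplicate compilation entirely
        return captured

    with patch(f"{__name__}.cache_key_fn", patched):
        try:
            artifact = expensive_compile(graph_repr)
        except StopCompiling:
            return loaded_artifacts[captured]  # reuse the in-memory artifact
    loaded_artifacts[captured] = artifact
    return artifact


assert compile_once("g") == compile_once("g")  # second call is a cache hit
```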
@@ -497,7 +449,7 @@ def wrap_with_cudagraph_if_needed(
         # it from the FULL cudagraph runtime mode, no matter it
         # is wrapped on a full or piecewise fx graph.
         return static_graph_wrapper_class(
-            runnable=piecewise_backend,
+            runnable=piecewise_backend.graph.forward,
             vllm_config=vllm_config,
             runtime_mode=CUDAGraphMode.PIECEWISE,
             cudagraph_options=CUDAGraphOptions(
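The overlay points the wrapper at the underlying GraphModule's eager `forward` rather than at the piecewise backend object, bypassing the backend's own `__call__` dispatch at replay time. The wrapper only needs *some* callable to capture and replay, as this hypothetical sketch shows (vLLM's real wrapper additionally performs CUDA graph capture):

```python
# StaticGraphWrapper is illustrative only; it shows why any callable,
# including backend.graph.forward, satisfies the `runnable` parameter.
from typing import Any, Callable


class StaticGraphWrapper:
    def __init__(self, runnable: Callable[..., Any]):
        self.runnable = runnable
        self.captured = False

    def __call__(self, *args: Any) -> Any:
        if not self.captured:
            # Real implementation: capture a CUDA graph on the first call.
            self.captured = True
        return self.runnable(*args)  # replay goes straight to the callable
```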
@@ -780,7 +732,7 @@ class VllmBackend:
         return standalone_compile_artifacts, sym_shape_indices_map, returns_tuple_map

     def configure_post_pass(self) -> None:
-        # self.pass_manager.configure(self.vllm_config)
+        self.pass_manager.configure(self.vllm_config)

         # Post-grad custom passes are run using the post_grad_custom_post_pass
         # hook. If a pass for that hook exists, add it to the pass manager.
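The context comment describes folding any user-installed post-grad pass into the managed pipeline before taking over the hook. A hedged sketch of that handoff, with a hypothetical `PassManager`; `post_grad_custom_post_pass` is the Inductor config hook the comment names:

```python
# Illustrative only: how a pass manager can absorb an existing hook pass
# and then install itself on the post_grad_custom_post_pass hook.
from typing import Any, Callable


class PassManager:
    def __init__(self) -> None:
        self.passes: list[Callable[[Any], None]] = []

    def add(self, p: Callable[[Any], None]) -> None:
        self.passes.append(p)

    def __call__(self, graph: Any) -> None:
        for p in self.passes:
            p(graph)


def configure_post_pass(pass_manager: PassManager, inductor_config: dict) -> None:
    # Keep a user's existing pass by folding it into the pipeline,
    # then run the whole pipeline via the hook.
    existing = inductor_config.get("post_grad_custom_post_pass")
    if existing is not None:
        pass_manager.add(existing)
    inductor_config["post_grad_custom_post_pass"] = pass_manager
```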
@@ -846,7 +798,7 @@ class VllmBackend:
             ),
         )

-    def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
+    def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any], **kwargs) -> Any:
         from .caching import (
             VllmSerializableFunction,
         )
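Adding `**kwargs` keeps the backend tolerant of extra keyword arguments that a newer torch may forward to compile backends. A torch.compile backend is just a callable taking a GraphModule and example inputs and returning a callable, so a minimal sketch looks like this (`ToyBackend` is illustrative; the backend protocol itself is torch's real API):

```python
# Sketch of the forward-compatible backend signature.
from typing import Any, Callable, Sequence

import torch
from torch import fx


class ToyBackend:
    def __call__(
        self,
        graph: fx.GraphModule,
        example_inputs: Sequence[Any],
        **kwargs: Any,  # tolerate extra args newer torch versions may pass
    ) -> Callable[..., Any]:
        return graph.forward  # run the captured graph eagerly


fn = torch.compile(lambda x: x * 2, backend=ToyBackend())
print(fn(torch.ones(2)))  # tensor([2., 2.])
```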
@@ -988,7 +940,7 @@ class VllmBackend:
         assert not self._called, "VllmBackend can only be called once"

         self.graph = graph
-        self.configure_post_pass()
+        # self.configure_post_pass()

         if self.compilation_config.use_inductor_graph_partition:
             # Let Inductor decide partitioning; avoid FX-level pre-splitting.
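For reference, the flag this branch tests is user-configurable. A hedged sketch of toggling it; `CompilationConfig` and `use_inductor_graph_partition` exist in recent vLLM, but the exact import path and defaults may differ across versions:

```python
# Assumption: vllm.config.CompilationConfig exposes this field directly.
from vllm.config import CompilationConfig

config = CompilationConfig(use_inductor_graph_partition=True)
```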