Upgrade to vLLM 0.17.0 (Corex v4.1 overlay).
This commit is contained in:
@@ -86,9 +86,16 @@ class CUDAGraphMode(enum.Enum):
|
||||
|
||||
def separate_routine(self) -> bool:
|
||||
return isinstance(self.value, tuple)
|
||||
|
||||
def decode_use_graph(self) -> bool:
|
||||
return self.decode_mode() == CUDAGraphMode.FULL
|
||||
|
||||
def valid_runtime_modes(self) -> bool:
|
||||
return self in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL]
|
||||
@classmethod
|
||||
def valid_runtime_modes(cls) -> frozenset["CUDAGraphMode"]:
|
||||
return frozenset({cls.NONE, cls.PIECEWISE, cls.FULL})
|
||||
|
||||
def is_valid_runtime_mode(self) -> bool:
|
||||
return self in CUDAGraphMode.valid_runtime_modes()
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.name
|
||||
@@ -385,7 +392,7 @@ class CompilationConfig:
     Please use mode. Currently all levels are mapped to mode.
     """
     # Top-level Compilation control
-    mode: CompilationMode = Field(default=None)
+    mode: CompilationMode = Field(default=CompilationMode.NONE)
     """The compilation approach used for torch.compile-based compilation of the
     model.
@@ -503,7 +510,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

     # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode = Field(default=None)
+    cudagraph_mode: CUDAGraphMode = Field(default=CUDAGraphMode.FULL_DECODE_ONLY)
     """
     The mode of the cudagraph:
@@ -1003,6 +1010,7 @@ class CompilationConfig:
             # https://github.com/vllm-project/vllm/issues/33267
             if not self.use_inductor_graph_partition:
                 self.splitting_ops.append("vllm::unified_kv_cache_update")
+                self.splitting_ops.append("vllm::unified_mla_kv_cache_update")

         elif len(self.splitting_ops) == 0:
             if (
@@ -1045,7 +1053,7 @@ class CompilationConfig:
                 "are optimized for prefill and are incompatible with CUDA Graphs. "
                 "In order to use CUDA Graphs for decode-optimized workloads, "
                 "use --all2all-backend with another option, such as "
-                "deepep_low_latency, pplx, or allgather_reducescatter."
+                "deepep_low_latency or allgather_reducescatter."
             )
             self.cudagraph_mode = CUDAGraphMode.NONE
Reference in New Issue
Block a user