Upgrade to vllm 0.17.0 corex v4.1 overlay

This commit is contained in:
2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -21,6 +21,7 @@ from vllm.model_executor.offloader.base import get_offloader
from vllm.platforms import current_platform
from vllm.utils.torch_utils import current_stream, weak_ref_tensors
from vllm.sequence import IntermediateTensors
logger = init_logger(__name__)
@@ -204,14 +205,14 @@ class CUDAGraphWrapper:
def unwrap(self) -> Callable[..., Any]:
    """Return the original runnable held by this wrapper.

    Lets callers bypass the cudagraph machinery and access the
    underlying callable directly.
    """
    original = self.runnable
    return original
def weak_ref_tensors_with_intermediate(self, output):
    """Weak-reference a graph output, handling IntermediateTensors.

    If ``output`` is an ``IntermediateTensors`` bundle, build a new
    bundle whose ``tensors`` dict maps each key to a weak reference of
    the corresponding tensor; otherwise weak-reference ``output``
    directly via ``weak_ref_tensors``.
    """
    if not isinstance(output, IntermediateTensors):
        return weak_ref_tensors(output)
    weak_map = {
        name: weak_ref_tensors(tensor)
        for name, tensor in output.tensors.items()
    }
    return IntermediateTensors(tensors=weak_map)
def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
forward_context = get_forward_context()
batch_descriptor = forward_context.batch_descriptor
@@ -298,12 +299,10 @@ class CUDAGraphWrapper:
# the last graph in piecewise cudagraph mode, because
# the output of the last graph will not be used by
# any other cuda graph.
# output = weak_ref_tensors(output)
output = self.weak_ref_tensors_with_intermediate(output)
# here we always use weak ref for the output
# to save memory
# entry.output = weak_ref_tensors(output)
entry.output = self.weak_ref_tensors_with_intermediate(output)
entry.cudagraph = cudagraph