Upgrade to vllm 0.17.0 corex v4.1 overlay
This commit is contained in:
@@ -21,6 +21,7 @@ from vllm.model_executor.offloader.base import get_offloader
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import current_stream, weak_ref_tensors
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@@ -204,14 +205,14 @@ class CUDAGraphWrapper:
|
||||
def unwrap(self) -> Callable[..., Any]:
    """Return the underlying runnable wrapped by this CUDA-graph wrapper.

    Lets callers reach the original callable directly, bypassing the
    graph capture/replay machinery in ``__call__``.
    """
    wrapped = self.runnable
    return wrapped
|
||||
|
||||
|
||||
def weak_ref_tensors_with_intermediate(self, output):
    """Convert *output* to weak tensor references to avoid keeping strong refs.

    Handles both a plain tensor(-container) and an ``IntermediateTensors``
    bundle: for the latter, every contained tensor is individually replaced
    by a weak reference and a new ``IntermediateTensors`` is returned.
    """
    if not isinstance(output, IntermediateTensors):
        # Plain output: delegate directly.
        return weak_ref_tensors(output)
    # Rebuild the bundle with each named tensor weak-referenced.
    weak_map = {name: weak_ref_tensors(tensor)
                for name, tensor in output.tensors.items()}
    return IntermediateTensors(tensors=weak_map)
|
||||
|
||||
|
||||
def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
|
||||
forward_context = get_forward_context()
|
||||
batch_descriptor = forward_context.batch_descriptor
|
||||
@@ -298,12 +299,10 @@ class CUDAGraphWrapper:
|
||||
# the last graph in piecewise cudagraph mode, because
|
||||
# the output of the last graph will not be used by
|
||||
# any other cuda graph.
|
||||
# output = weak_ref_tensors(output)
|
||||
output = self.weak_ref_tensors_with_intermediate(output)
|
||||
|
||||
# here we always use weak ref for the output
|
||||
# to save memory
|
||||
# entry.output = weak_ref_tensors(output)
|
||||
entry.output = self.weak_ref_tensors_with_intermediate(output)
|
||||
entry.cudagraph = cudagraph
|
||||
|
||||
|
||||
Reference in New Issue
Block a user