upgrade vLLM to main (#4608)

1. fix https://github.com/vllm-project/vllm/pull/28542
The model structure modifications we involved in are:
     - Qwen2.5-VL (some patches still exist)
     - Qwen2-VL
     - Qwen2
     - DeepSeek series
     - Qwen-moe series
2. fix https://github.com/vllm-project/vllm/pull/29121
   the output token type has now changed from a numpy array to `list[list[int]]`

3. fix https://github.com/vllm-project/vllm/pull/29262
    the `xformers` backend for multimodal has now been deprecated
4. fix https://github.com/vllm-project/vllm/pull/29342

5. fix https://github.com/vllm-project/vllm/pull/28579
6. fix https://github.com/vllm-project/vllm/pull/28718
7. fix https://github.com/vllm-project/vllm/issues/28665
8. fix https://github.com/vllm-project/vllm/pull/26847
vLLM introduced the `optimization-level` option; some default configs have
changed, and the `--enforce-eager` parameter has been deprecated
9. fix https://github.com/vllm-project/vllm/pull/29223 — the sampler now
returns a tuple.
10. fix https://github.com/vllm-project/vllm/pull/29471 we'll remove the
related patch to avoid this kind of error.

Co-authored-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangli <wangli858794774@gmail.com>


- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
wangxiyuan
2025-12-02 22:10:52 +08:00
committed by GitHub
parent 4588cdac02
commit 7f2673ea2d
60 changed files with 383 additions and 374 deletions

View File

@@ -9,14 +9,14 @@ from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional, Sequence
import torch
from vllm.attention import AttentionType
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.distributed.parallel_state import get_pp_group, get_tp_group
from vllm.logger import logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.utils import logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheSpec,
MLAAttentionSpec)

View File

@@ -2,7 +2,8 @@ import time
from collections import defaultdict
from typing import Optional
from vllm.utils import logger, sha256
from vllm.logger import logger
from vllm.utils.hashing import sha256
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
PrefixCachingMetrics)

View File

@@ -9,7 +9,7 @@ import torch
import vllm.envs as envs
import zmq
from vllm.config import KVTransferConfig, VllmConfig
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.utils.torch_utils import get_dtype_size
from vllm.v1.kv_cache_interface import AttentionSpec

View File

@@ -8,7 +8,7 @@ from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.forward_context import ForwardContext
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import SchedulerOutput

View File

@@ -3,7 +3,7 @@ from enum import Enum
import torch
from vllm.config import ParallelConfig
from vllm.utils import logger
from vllm.logger import logger
from vllm_ascend.distributed.kvpool.backend.backend import Backend

View File

@@ -7,7 +7,7 @@ from typing import Union
# Third Party
from vllm.config import ParallelConfig
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import get_ip
from vllm_ascend.distributed.kvpool.backend.backend import Backend

View File

@@ -3,7 +3,7 @@ from typing import Iterable, List, Optional, Tuple, Union
from vllm.distributed.kv_transfer.kv_connector.v1.base import \
KVConnectorMetadata
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.math_utils import cdiv
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm.v1.core.sched.output import NewRequestData

View File

@@ -4,7 +4,7 @@ from concurrent.futures import ThreadPoolExecutor
from typing import Any, Optional
import torch
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm_ascend.distributed.kvpool.backend.backend import Backend

View File

@@ -5,7 +5,7 @@ import zmq
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import \
KVConnectorMetadata
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.kv_cache_utils import BlockHash

View File

@@ -8,7 +8,7 @@ from vllm.distributed import (get_decode_context_model_parallel_rank,
get_decode_context_model_parallel_world_size,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size)
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm_ascend.distributed.kvpool.backend.backend import Backend

View File

@@ -25,7 +25,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
from vllm.distributed.parallel_state import (get_dcp_group, get_tp_group,
get_world_group)
from vllm.forward_context import ForwardContext
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig

View File

@@ -29,7 +29,7 @@ from vllm.distributed.parallel_state import (
get_decode_context_model_parallel_rank,
get_decode_context_model_parallel_world_size,
get_tensor_model_parallel_rank, get_tp_group)
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import RequestStatus

View File

@@ -27,7 +27,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
get_tp_group, get_world_group)
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig