diff --git a/Dockerfile b/Dockerfile index 62b7be9..51d3df7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,15 @@ -FROM git.modelhub.org.cn:9443/enginex-iluvatar/bi100-3.2.1-x86-ubuntu20.04-py3.10-poc-llm-infer:20250731115755 +FROM git.modelhub.org.cn:9443/enginex-iluvatar/bi100-3.2.3-x86-ubuntu20.04-py3.10-poc-llm-infer:v1.2.3 RUN pip install --no-cache-dir triton==2.1.0 COPY pkgs/triton /usr/local/corex/lib64/python3/dist-packages/triton COPY pkgs/triton-2.1.0+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/triton-2.1.0+corex.4.1.2.dist-info -COPY pkgs/xformers-0.0.22+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/xformers-0.0.22+corex.4.1.2.dist-info -COPY pkgs/xformers /usr/local/corex/lib64/python3/dist-packages/xformers -COPY paged_attn.py /usr/local/lib/python3.10/site-packages/vllm/attention/ops/paged_attn.py -COPY __init__.py /usr/local/lib/python3.10/site-packages/vllm/triton_utils/__init__.py -COPY prefix_prefill.py /usr/local/lib/python3.10/site-packages/vllm/attention/ops/prefix_prefill.py +COPY paged_attn.py /usr/local/corex/lib64/python3/dist-packages/vllm/attention/ops/paged_attn.py +COPY __init__.py /usr/local/corex/lib64/python3/dist-packages/vllm/triton_utils/__init__.py RUN mkdir /workspace WORKDIR /workspace/ COPY ./launch_service /workspace/launch_service -ENTRYPOINT ["./launch_service"] diff --git a/paged_attn.py b/paged_attn.py index 1741dd1..988f903 100644 --- a/paged_attn.py +++ b/paged_attn.py @@ -4,6 +4,7 @@ from typing import List, Optional, Tuple import torch from vllm import _custom_ops as ops + from vllm.attention.ops.prefix_prefill import context_attention_fwd # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. diff --git a/vllm/__pycache__/__init__.cpython-310.pyc b/vllm/__pycache__/__init__.cpython-310.pyc index 1190aa9..ba64550 100644 Binary files a/vllm/__pycache__/__init__.cpython-310.pyc and b/vllm/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/__pycache__/_core_ext.cpython-310.pyc b/vllm/__pycache__/_core_ext.cpython-310.pyc index aaa4f73..3c7a3da 100644 Binary files a/vllm/__pycache__/_core_ext.cpython-310.pyc and b/vllm/__pycache__/_core_ext.cpython-310.pyc differ diff --git a/vllm/__pycache__/_custom_ops.cpython-310.pyc b/vllm/__pycache__/_custom_ops.cpython-310.pyc index 4f0ece7..fc17fb1 100644 Binary files a/vllm/__pycache__/_custom_ops.cpython-310.pyc and b/vllm/__pycache__/_custom_ops.cpython-310.pyc differ diff --git a/vllm/__pycache__/_ipex_ops.cpython-310.pyc b/vllm/__pycache__/_ipex_ops.cpython-310.pyc index 69c6c0c..afea7b9 100644 Binary files a/vllm/__pycache__/_ipex_ops.cpython-310.pyc and b/vllm/__pycache__/_ipex_ops.cpython-310.pyc differ diff --git a/vllm/__pycache__/beam_search.cpython-310.pyc b/vllm/__pycache__/beam_search.cpython-310.pyc index cad0282..4e88877 100644 Binary files a/vllm/__pycache__/beam_search.cpython-310.pyc and b/vllm/__pycache__/beam_search.cpython-310.pyc differ diff --git a/vllm/__pycache__/block.cpython-310.pyc b/vllm/__pycache__/block.cpython-310.pyc index c27aa2b..2ec9664 100644 Binary files a/vllm/__pycache__/block.cpython-310.pyc and b/vllm/__pycache__/block.cpython-310.pyc differ diff --git a/vllm/__pycache__/config.cpython-310.pyc b/vllm/__pycache__/config.cpython-310.pyc index e54d5ee..7f93840 100644 Binary files a/vllm/__pycache__/config.cpython-310.pyc and b/vllm/__pycache__/config.cpython-310.pyc differ diff --git a/vllm/__pycache__/connections.cpython-310.pyc b/vllm/__pycache__/connections.cpython-310.pyc index d662859..977dbba 100644 Binary files a/vllm/__pycache__/connections.cpython-310.pyc and b/vllm/__pycache__/connections.cpython-310.pyc differ diff --git a/vllm/__pycache__/envs.cpython-310.pyc b/vllm/__pycache__/envs.cpython-310.pyc index bde38c9..223d6c0 100644 Binary files a/vllm/__pycache__/envs.cpython-310.pyc and b/vllm/__pycache__/envs.cpython-310.pyc differ diff --git a/vllm/__pycache__/forward_context.cpython-310.pyc b/vllm/__pycache__/forward_context.cpython-310.pyc index 539b909..135a5b3 100644 Binary files a/vllm/__pycache__/forward_context.cpython-310.pyc and b/vllm/__pycache__/forward_context.cpython-310.pyc differ diff --git a/vllm/__pycache__/logger.cpython-310.pyc b/vllm/__pycache__/logger.cpython-310.pyc index 15c9df2..5009ecf 100644 Binary files a/vllm/__pycache__/logger.cpython-310.pyc and b/vllm/__pycache__/logger.cpython-310.pyc differ diff --git a/vllm/__pycache__/outputs.cpython-310.pyc b/vllm/__pycache__/outputs.cpython-310.pyc index 448d4fe..932e79b 100644 Binary files a/vllm/__pycache__/outputs.cpython-310.pyc and b/vllm/__pycache__/outputs.cpython-310.pyc differ diff --git a/vllm/__pycache__/pooling_params.cpython-310.pyc b/vllm/__pycache__/pooling_params.cpython-310.pyc index 05d2402..da6c1ba 100644 Binary files a/vllm/__pycache__/pooling_params.cpython-310.pyc and b/vllm/__pycache__/pooling_params.cpython-310.pyc differ diff --git a/vllm/__pycache__/sampling_params.cpython-310.pyc b/vllm/__pycache__/sampling_params.cpython-310.pyc index 31c99bc..371575b 100644 Binary files a/vllm/__pycache__/sampling_params.cpython-310.pyc and b/vllm/__pycache__/sampling_params.cpython-310.pyc differ diff --git a/vllm/__pycache__/scalar_type.cpython-310.pyc b/vllm/__pycache__/scalar_type.cpython-310.pyc index 8aa5fca..5f2f2da 100644 Binary files a/vllm/__pycache__/scalar_type.cpython-310.pyc and b/vllm/__pycache__/scalar_type.cpython-310.pyc differ diff --git a/vllm/__pycache__/scripts.cpython-310.pyc b/vllm/__pycache__/scripts.cpython-310.pyc index 45eb857..3d8abe2 100644 Binary files a/vllm/__pycache__/scripts.cpython-310.pyc and b/vllm/__pycache__/scripts.cpython-310.pyc differ diff --git a/vllm/__pycache__/sequence.cpython-310.pyc b/vllm/__pycache__/sequence.cpython-310.pyc index ca6591b..de03038 100644 Binary files a/vllm/__pycache__/sequence.cpython-310.pyc and b/vllm/__pycache__/sequence.cpython-310.pyc differ diff --git a/vllm/__pycache__/tracing.cpython-310.pyc b/vllm/__pycache__/tracing.cpython-310.pyc index b4dd2a9..0914d51 100644 Binary files a/vllm/__pycache__/tracing.cpython-310.pyc and b/vllm/__pycache__/tracing.cpython-310.pyc differ diff --git a/vllm/__pycache__/utils.cpython-310.pyc b/vllm/__pycache__/utils.cpython-310.pyc index 3b90b20..02651de 100644 Binary files a/vllm/__pycache__/utils.cpython-310.pyc and b/vllm/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/__pycache__/version.cpython-310.pyc b/vllm/__pycache__/version.cpython-310.pyc index de6fe1d..42f846b 100644 Binary files a/vllm/__pycache__/version.cpython-310.pyc and b/vllm/__pycache__/version.cpython-310.pyc differ diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index ac4cce9..64a5534 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1102,4 +1102,4 @@ for k, v in names_and_values.items(): names_and_values_to_update[k] = hint_on_error(v) names_and_values.update(names_and_values_to_update) -del names_and_values_to_update, names_and_values, v, k, fn_type +del names_and_values_to_update, names_and_values, v, k, fn_type \ No newline at end of file diff --git a/vllm/adapter_commons/__pycache__/__init__.cpython-310.pyc b/vllm/adapter_commons/__pycache__/__init__.cpython-310.pyc index e3d7395..10fe168 100644 Binary files a/vllm/adapter_commons/__pycache__/__init__.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/adapter_commons/__pycache__/layers.cpython-310.pyc b/vllm/adapter_commons/__pycache__/layers.cpython-310.pyc index 58eba13..e94d5a8 100644 Binary files a/vllm/adapter_commons/__pycache__/layers.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/layers.cpython-310.pyc differ diff --git a/vllm/adapter_commons/__pycache__/models.cpython-310.pyc b/vllm/adapter_commons/__pycache__/models.cpython-310.pyc index c430e4e..58b2682 100644 Binary files a/vllm/adapter_commons/__pycache__/models.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/models.cpython-310.pyc differ diff --git a/vllm/adapter_commons/__pycache__/request.cpython-310.pyc b/vllm/adapter_commons/__pycache__/request.cpython-310.pyc index 46b7e60..c5beddc 100644 Binary files a/vllm/adapter_commons/__pycache__/request.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/request.cpython-310.pyc differ diff --git a/vllm/adapter_commons/__pycache__/utils.cpython-310.pyc b/vllm/adapter_commons/__pycache__/utils.cpython-310.pyc index bd20645..9a9082d 100644 Binary files a/vllm/adapter_commons/__pycache__/utils.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/adapter_commons/__pycache__/worker_manager.cpython-310.pyc b/vllm/adapter_commons/__pycache__/worker_manager.cpython-310.pyc index 0637c3c..d00fc5c 100644 Binary files a/vllm/adapter_commons/__pycache__/worker_manager.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/worker_manager.cpython-310.pyc differ diff --git a/vllm/assets/__pycache__/__init__.cpython-310.pyc b/vllm/assets/__pycache__/__init__.cpython-310.pyc index 6f5c7c6..6d8da66 100644 Binary files a/vllm/assets/__pycache__/__init__.cpython-310.pyc and b/vllm/assets/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/assets/__pycache__/audio.cpython-310.pyc b/vllm/assets/__pycache__/audio.cpython-310.pyc index 4a4f7a8..f0638ff 100644 Binary files a/vllm/assets/__pycache__/audio.cpython-310.pyc and b/vllm/assets/__pycache__/audio.cpython-310.pyc differ diff --git a/vllm/assets/__pycache__/base.cpython-310.pyc b/vllm/assets/__pycache__/base.cpython-310.pyc index a5a838c..2976baa 100644 Binary files a/vllm/assets/__pycache__/base.cpython-310.pyc and b/vllm/assets/__pycache__/base.cpython-310.pyc differ diff --git a/vllm/assets/__pycache__/image.cpython-310.pyc b/vllm/assets/__pycache__/image.cpython-310.pyc index 644befd..7abde64 100644 Binary files a/vllm/assets/__pycache__/image.cpython-310.pyc and b/vllm/assets/__pycache__/image.cpython-310.pyc differ diff --git a/vllm/assets/__pycache__/video.cpython-310.pyc b/vllm/assets/__pycache__/video.cpython-310.pyc index 6d2fbac..098a5a1 100644 Binary files a/vllm/assets/__pycache__/video.cpython-310.pyc and b/vllm/assets/__pycache__/video.cpython-310.pyc differ diff --git a/vllm/assets/image.py b/vllm/assets/image.py index 5eec78c..8e30381 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -27,4 +27,4 @@ class ImageAsset: """ image_path = get_vllm_public_assets(filename=f"{self.name}.pt", s3_prefix=VLM_IMAGES_DIR) - return torch.load(image_path) + return torch.load(image_path, weights_only=True) diff --git a/vllm/attention/__pycache__/__init__.cpython-310.pyc b/vllm/attention/__pycache__/__init__.cpython-310.pyc index 080c3e8..99ccc3b 100644 Binary files a/vllm/attention/__pycache__/__init__.cpython-310.pyc and b/vllm/attention/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/attention/__pycache__/layer.cpython-310.pyc b/vllm/attention/__pycache__/layer.cpython-310.pyc index 29f52c1..38235bb 100644 Binary files a/vllm/attention/__pycache__/layer.cpython-310.pyc and b/vllm/attention/__pycache__/layer.cpython-310.pyc differ diff --git a/vllm/attention/__pycache__/selector.cpython-310.pyc b/vllm/attention/__pycache__/selector.cpython-310.pyc index e6fdecb..382e7b5 100644 Binary files a/vllm/attention/__pycache__/selector.cpython-310.pyc and b/vllm/attention/__pycache__/selector.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc b/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc index 19f3f54..c61e24b 100644 Binary files a/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc and b/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc b/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc index bb16759..ecf40f5 100644 Binary files a/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc and b/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/blocksparse_attn.cpython-310.pyc b/vllm/attention/backends/__pycache__/blocksparse_attn.cpython-310.pyc index 15f6fca..ca08c23 100644 Binary files a/vllm/attention/backends/__pycache__/blocksparse_attn.cpython-310.pyc and b/vllm/attention/backends/__pycache__/blocksparse_attn.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc b/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc index 810cba2..0d31402 100644 Binary files a/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc and b/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/flashinfer.cpython-310.pyc b/vllm/attention/backends/__pycache__/flashinfer.cpython-310.pyc index efdfb57..212a05e 100644 Binary files a/vllm/attention/backends/__pycache__/flashinfer.cpython-310.pyc and b/vllm/attention/backends/__pycache__/flashinfer.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/ipex_attn.cpython-310.pyc b/vllm/attention/backends/__pycache__/ipex_attn.cpython-310.pyc index 9721229..1547d66 100644 Binary files a/vllm/attention/backends/__pycache__/ipex_attn.cpython-310.pyc and b/vllm/attention/backends/__pycache__/ipex_attn.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/openvino.cpython-310.pyc b/vllm/attention/backends/__pycache__/openvino.cpython-310.pyc index 2b35a08..c57a295 100644 Binary files a/vllm/attention/backends/__pycache__/openvino.cpython-310.pyc and b/vllm/attention/backends/__pycache__/openvino.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/pallas.cpython-310.pyc b/vllm/attention/backends/__pycache__/pallas.cpython-310.pyc index 08e0554..8d92265 100644 Binary files a/vllm/attention/backends/__pycache__/pallas.cpython-310.pyc and b/vllm/attention/backends/__pycache__/pallas.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/placeholder_attn.cpython-310.pyc b/vllm/attention/backends/__pycache__/placeholder_attn.cpython-310.pyc index 5ce3aa7..3a982f1 100644 Binary files a/vllm/attention/backends/__pycache__/placeholder_attn.cpython-310.pyc and b/vllm/attention/backends/__pycache__/placeholder_attn.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/rocm_flash_attn.cpython-310.pyc b/vllm/attention/backends/__pycache__/rocm_flash_attn.cpython-310.pyc index d354027..bac8ec2 100644 Binary files a/vllm/attention/backends/__pycache__/rocm_flash_attn.cpython-310.pyc and b/vllm/attention/backends/__pycache__/rocm_flash_attn.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/torch_sdpa.cpython-310.pyc b/vllm/attention/backends/__pycache__/torch_sdpa.cpython-310.pyc index d0524c2..2647e0a 100644 Binary files a/vllm/attention/backends/__pycache__/torch_sdpa.cpython-310.pyc and b/vllm/attention/backends/__pycache__/torch_sdpa.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/utils.cpython-310.pyc b/vllm/attention/backends/__pycache__/utils.cpython-310.pyc index a1e7d37..0669a5f 100644 Binary files a/vllm/attention/backends/__pycache__/utils.cpython-310.pyc and b/vllm/attention/backends/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/attention/backends/__pycache__/xformers.cpython-310.pyc b/vllm/attention/backends/__pycache__/xformers.cpython-310.pyc index 19ce106..e6bb887 100644 Binary files a/vllm/attention/backends/__pycache__/xformers.cpython-310.pyc and b/vllm/attention/backends/__pycache__/xformers.cpython-310.pyc differ diff --git a/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc b/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc index 591955c..4c36349 100644 Binary files a/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc and b/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/attention/ops/__pycache__/ipex_attn.cpython-310.pyc b/vllm/attention/ops/__pycache__/ipex_attn.cpython-310.pyc index bb52c49..34922a6 100644 Binary files a/vllm/attention/ops/__pycache__/ipex_attn.cpython-310.pyc and b/vllm/attention/ops/__pycache__/ipex_attn.cpython-310.pyc differ diff --git a/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc b/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc index ddc8a86..5fe7b1e 100644 Binary files a/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc and b/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc differ diff --git a/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc b/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc index bb82b22..b31f3a1 100644 Binary files a/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc and b/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc differ diff --git a/vllm/attention/ops/__pycache__/triton_flash_attention.cpython-310.pyc b/vllm/attention/ops/__pycache__/triton_flash_attention.cpython-310.pyc index 33c30d5..f8bd2f5 100644 Binary files a/vllm/attention/ops/__pycache__/triton_flash_attention.cpython-310.pyc and b/vllm/attention/ops/__pycache__/triton_flash_attention.cpython-310.pyc differ diff --git a/vllm/attention/ops/blocksparse_attention/__pycache__/__init__.cpython-310.pyc b/vllm/attention/ops/blocksparse_attention/__pycache__/__init__.cpython-310.pyc index ec2a31a..4231da0 100644 Binary files a/vllm/attention/ops/blocksparse_attention/__pycache__/__init__.cpython-310.pyc and b/vllm/attention/ops/blocksparse_attention/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/attention/ops/blocksparse_attention/__pycache__/blocksparse_attention_kernel.cpython-310.pyc b/vllm/attention/ops/blocksparse_attention/__pycache__/blocksparse_attention_kernel.cpython-310.pyc index 71fa12d..45bec7c 100644 Binary files a/vllm/attention/ops/blocksparse_attention/__pycache__/blocksparse_attention_kernel.cpython-310.pyc and b/vllm/attention/ops/blocksparse_attention/__pycache__/blocksparse_attention_kernel.cpython-310.pyc differ diff --git a/vllm/attention/ops/blocksparse_attention/__pycache__/interface.cpython-310.pyc b/vllm/attention/ops/blocksparse_attention/__pycache__/interface.cpython-310.pyc index e01d624..825470f 100644 Binary files a/vllm/attention/ops/blocksparse_attention/__pycache__/interface.cpython-310.pyc and b/vllm/attention/ops/blocksparse_attention/__pycache__/interface.cpython-310.pyc differ diff --git a/vllm/attention/ops/blocksparse_attention/__pycache__/utils.cpython-310.pyc b/vllm/attention/ops/blocksparse_attention/__pycache__/utils.cpython-310.pyc index 306f346..3f9fde4 100644 Binary files a/vllm/attention/ops/blocksparse_attention/__pycache__/utils.cpython-310.pyc and b/vllm/attention/ops/blocksparse_attention/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 1741dd1..c90e8dd 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -4,7 +4,10 @@ from typing import List, Optional, Tuple import torch from vllm import _custom_ops as ops -from vllm.attention.ops.prefix_prefill import context_attention_fwd +from vllm.triton_utils import HAS_TRITON + +if HAS_TRITON: + from vllm.attention.ops.prefix_prefill import context_attention_fwd # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. _PARTITION_SIZE = 512 diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 9a39e2b..a2a649c 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -808,8 +808,6 @@ if triton.__version__ >= "2.1.0": ) return - import time - ts_beg = time.time() _fwd_kernel[grid]( q, k, @@ -860,6 +858,4 @@ if triton.__version__ >= "2.1.0": num_warps=NUM_WARPS, num_stages=1, ) - elapsed = time.time() - ts_beg - #print(f'{elapsed}: {BLOCK=}, {Lk=}, {Lk_padded=}, {BLOCK=}, {sliding_window=}, {NUM_WARPS=}') return diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index d1a0bd2..ae90e03 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -204,9 +204,6 @@ def which_attn_to_use( if selected_backend != _Backend.PALLAS: logger.info("Cannot use %s backend on TPU.", selected_backend) return _Backend.PALLAS - - if selected_backend == _Backend.FLASH_ATTN: - print("selected_backend == _Backend.FLASH_ATTN") if is_hip(): # AMD GPUs. diff --git a/vllm/compilation/__pycache__/__init__.cpython-310.pyc b/vllm/compilation/__pycache__/__init__.cpython-310.pyc index 004a088..8f5fb17 100644 Binary files a/vllm/compilation/__pycache__/__init__.cpython-310.pyc and b/vllm/compilation/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/compilation/__pycache__/backends.cpython-310.pyc b/vllm/compilation/__pycache__/backends.cpython-310.pyc index 1b80e58..3cdd64f 100644 Binary files a/vllm/compilation/__pycache__/backends.cpython-310.pyc and b/vllm/compilation/__pycache__/backends.cpython-310.pyc differ diff --git a/vllm/compilation/__pycache__/compile_context.cpython-310.pyc b/vllm/compilation/__pycache__/compile_context.cpython-310.pyc index dc22671..237dede 100644 Binary files a/vllm/compilation/__pycache__/compile_context.cpython-310.pyc and b/vllm/compilation/__pycache__/compile_context.cpython-310.pyc differ diff --git a/vllm/compilation/__pycache__/decorators.cpython-310.pyc b/vllm/compilation/__pycache__/decorators.cpython-310.pyc index 8a478e6..822cc60 100644 Binary files a/vllm/compilation/__pycache__/decorators.cpython-310.pyc and b/vllm/compilation/__pycache__/decorators.cpython-310.pyc differ diff --git a/vllm/compilation/__pycache__/levels.cpython-310.pyc b/vllm/compilation/__pycache__/levels.cpython-310.pyc index 1a93d5b..50e2e9f 100644 Binary files a/vllm/compilation/__pycache__/levels.cpython-310.pyc and b/vllm/compilation/__pycache__/levels.cpython-310.pyc differ diff --git a/vllm/compilation/__pycache__/wrapper.cpython-310.pyc b/vllm/compilation/__pycache__/wrapper.cpython-310.pyc index 8eeb143..d02e998 100644 Binary files a/vllm/compilation/__pycache__/wrapper.cpython-310.pyc and b/vllm/compilation/__pycache__/wrapper.cpython-310.pyc differ diff --git a/vllm/core/__pycache__/__init__.cpython-310.pyc b/vllm/core/__pycache__/__init__.cpython-310.pyc index 9f6eb9f..41f5688 100644 Binary files a/vllm/core/__pycache__/__init__.cpython-310.pyc and b/vllm/core/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc b/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc index 8c402a8..8c4fe9e 100644 Binary files a/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc and b/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc differ diff --git a/vllm/core/__pycache__/block_manager_v2.cpython-310.pyc b/vllm/core/__pycache__/block_manager_v2.cpython-310.pyc index dd98dec..7d29f4e 100644 Binary files a/vllm/core/__pycache__/block_manager_v2.cpython-310.pyc and b/vllm/core/__pycache__/block_manager_v2.cpython-310.pyc differ diff --git a/vllm/core/__pycache__/evictor_v1.cpython-310.pyc b/vllm/core/__pycache__/evictor_v1.cpython-310.pyc index c3ea008..2caaca7 100644 Binary files a/vllm/core/__pycache__/evictor_v1.cpython-310.pyc and b/vllm/core/__pycache__/evictor_v1.cpython-310.pyc differ diff --git a/vllm/core/__pycache__/evictor_v2.cpython-310.pyc b/vllm/core/__pycache__/evictor_v2.cpython-310.pyc index 6f0fe90..d0da42d 100644 Binary files a/vllm/core/__pycache__/evictor_v2.cpython-310.pyc and b/vllm/core/__pycache__/evictor_v2.cpython-310.pyc differ diff --git a/vllm/core/__pycache__/interfaces.cpython-310.pyc b/vllm/core/__pycache__/interfaces.cpython-310.pyc index 444f4a8..7fed706 100644 Binary files a/vllm/core/__pycache__/interfaces.cpython-310.pyc and b/vllm/core/__pycache__/interfaces.cpython-310.pyc differ diff --git a/vllm/core/__pycache__/placeholder_block_space_manager.cpython-310.pyc b/vllm/core/__pycache__/placeholder_block_space_manager.cpython-310.pyc index 34b1c47..923858d 100644 Binary files a/vllm/core/__pycache__/placeholder_block_space_manager.cpython-310.pyc and b/vllm/core/__pycache__/placeholder_block_space_manager.cpython-310.pyc differ diff --git a/vllm/core/__pycache__/scheduler.cpython-310.pyc b/vllm/core/__pycache__/scheduler.cpython-310.pyc index ac0dc40..0f0d191 100644 Binary files a/vllm/core/__pycache__/scheduler.cpython-310.pyc and b/vllm/core/__pycache__/scheduler.cpython-310.pyc differ diff --git a/vllm/core/block/__pycache__/__init__.cpython-310.pyc b/vllm/core/block/__pycache__/__init__.cpython-310.pyc index c456286..34aafd7 100644 Binary files a/vllm/core/block/__pycache__/__init__.cpython-310.pyc and b/vllm/core/block/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/core/block/__pycache__/block_table.cpython-310.pyc b/vllm/core/block/__pycache__/block_table.cpython-310.pyc index 5600c63..08a2cb6 100644 Binary files a/vllm/core/block/__pycache__/block_table.cpython-310.pyc and b/vllm/core/block/__pycache__/block_table.cpython-310.pyc differ diff --git a/vllm/core/block/__pycache__/common.cpython-310.pyc b/vllm/core/block/__pycache__/common.cpython-310.pyc index 5ac6490..61171ac 100644 Binary files a/vllm/core/block/__pycache__/common.cpython-310.pyc and b/vllm/core/block/__pycache__/common.cpython-310.pyc differ diff --git a/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-310.pyc b/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-310.pyc index b928f93..fe2ad47 100644 Binary files a/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-310.pyc and b/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-310.pyc differ diff --git a/vllm/core/block/__pycache__/interfaces.cpython-310.pyc b/vllm/core/block/__pycache__/interfaces.cpython-310.pyc index 5b72ccf..209e865 100644 Binary files a/vllm/core/block/__pycache__/interfaces.cpython-310.pyc and b/vllm/core/block/__pycache__/interfaces.cpython-310.pyc differ diff --git a/vllm/core/block/__pycache__/naive_block.cpython-310.pyc b/vllm/core/block/__pycache__/naive_block.cpython-310.pyc index 859d45a..b6ab38d 100644 Binary files a/vllm/core/block/__pycache__/naive_block.cpython-310.pyc and b/vllm/core/block/__pycache__/naive_block.cpython-310.pyc differ diff --git a/vllm/core/block/__pycache__/prefix_caching_block.cpython-310.pyc b/vllm/core/block/__pycache__/prefix_caching_block.cpython-310.pyc index ccc5f07..ef02311 100644 Binary files a/vllm/core/block/__pycache__/prefix_caching_block.cpython-310.pyc and b/vllm/core/block/__pycache__/prefix_caching_block.cpython-310.pyc differ diff --git a/vllm/core/block/__pycache__/utils.cpython-310.pyc b/vllm/core/block/__pycache__/utils.cpython-310.pyc index c5f9386..bf15d4d 100644 Binary files a/vllm/core/block/__pycache__/utils.cpython-310.pyc and b/vllm/core/block/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/distributed/__pycache__/__init__.cpython-310.pyc b/vllm/distributed/__pycache__/__init__.cpython-310.pyc index 96bce1f..912dfd7 100644 Binary files a/vllm/distributed/__pycache__/__init__.cpython-310.pyc and b/vllm/distributed/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/distributed/__pycache__/communication_op.cpython-310.pyc b/vllm/distributed/__pycache__/communication_op.cpython-310.pyc index d3c14b3..52ccfdc 100644 Binary files a/vllm/distributed/__pycache__/communication_op.cpython-310.pyc and b/vllm/distributed/__pycache__/communication_op.cpython-310.pyc differ diff --git a/vllm/distributed/__pycache__/parallel_state.cpython-310.pyc b/vllm/distributed/__pycache__/parallel_state.cpython-310.pyc index d3779a1..f696dc6 100644 Binary files a/vllm/distributed/__pycache__/parallel_state.cpython-310.pyc and b/vllm/distributed/__pycache__/parallel_state.cpython-310.pyc differ diff --git a/vllm/distributed/__pycache__/utils.cpython-310.pyc b/vllm/distributed/__pycache__/utils.cpython-310.pyc index 424b3ac..c4fac5b 100644 Binary files a/vllm/distributed/__pycache__/utils.cpython-310.pyc and b/vllm/distributed/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/distributed/device_communicators/__pycache__/__init__.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/__init__.cpython-310.pyc index 58d4309..2619583 100644 Binary files a/vllm/distributed/device_communicators/__pycache__/__init__.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-310.pyc index f0c4225..b88d48c 100644 Binary files a/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-310.pyc differ diff --git a/vllm/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-310.pyc index 17f8c24..045bef6 100644 Binary files a/vllm/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-310.pyc differ diff --git a/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-310.pyc index e215e78..44034d5 100644 Binary files a/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-310.pyc differ diff --git a/vllm/distributed/device_communicators/__pycache__/pynccl.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/pynccl.cpython-310.pyc index 222c197..7018b7e 100644 Binary files a/vllm/distributed/device_communicators/__pycache__/pynccl.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/pynccl.cpython-310.pyc differ diff --git a/vllm/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-310.pyc index 19d5c14..0563cd4 100644 Binary files a/vllm/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-310.pyc differ diff --git a/vllm/distributed/device_communicators/__pycache__/shm_broadcast.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/shm_broadcast.cpython-310.pyc index b63c55a..f2b9616 100644 Binary files a/vllm/distributed/device_communicators/__pycache__/shm_broadcast.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/shm_broadcast.cpython-310.pyc differ diff --git a/vllm/distributed/device_communicators/__pycache__/tpu_communicator.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/tpu_communicator.cpython-310.pyc index 7c438a6..bbe4348 100644 Binary files a/vllm/distributed/device_communicators/__pycache__/tpu_communicator.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/tpu_communicator.cpython-310.pyc differ diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 7d526b2..c41a505 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -217,7 +217,8 @@ class MessageQueue: remote_subscribe_port = get_open_port() if is_valid_ipv6_address(connect_ip): self.remote_socket.setsockopt(IPV6, 1) - socket_addr = f"tcp://*:{remote_subscribe_port}" + connect_ip = f"[{connect_ip}]" + socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}" self.remote_socket.bind(socket_addr) else: diff --git a/vllm/engine/__pycache__/__init__.cpython-310.pyc b/vllm/engine/__pycache__/__init__.cpython-310.pyc index 1312f8c..20b1de9 100644 Binary files a/vllm/engine/__pycache__/__init__.cpython-310.pyc and b/vllm/engine/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/engine/__pycache__/arg_utils.cpython-310.pyc b/vllm/engine/__pycache__/arg_utils.cpython-310.pyc index cd5d308..93db9cb 100644 Binary files a/vllm/engine/__pycache__/arg_utils.cpython-310.pyc and b/vllm/engine/__pycache__/arg_utils.cpython-310.pyc differ diff --git a/vllm/engine/__pycache__/async_llm_engine.cpython-310.pyc b/vllm/engine/__pycache__/async_llm_engine.cpython-310.pyc index 9d8f931..ecb32f4 100644 Binary files a/vllm/engine/__pycache__/async_llm_engine.cpython-310.pyc and b/vllm/engine/__pycache__/async_llm_engine.cpython-310.pyc differ diff --git a/vllm/engine/__pycache__/async_timeout.cpython-310.pyc b/vllm/engine/__pycache__/async_timeout.cpython-310.pyc index 941df68..1f080ed 100644 Binary files a/vllm/engine/__pycache__/async_timeout.cpython-310.pyc and b/vllm/engine/__pycache__/async_timeout.cpython-310.pyc differ diff --git a/vllm/engine/__pycache__/llm_engine.cpython-310.pyc b/vllm/engine/__pycache__/llm_engine.cpython-310.pyc index 38aed70..c106623 100644 Binary files a/vllm/engine/__pycache__/llm_engine.cpython-310.pyc and b/vllm/engine/__pycache__/llm_engine.cpython-310.pyc differ diff --git a/vllm/engine/__pycache__/metrics.cpython-310.pyc b/vllm/engine/__pycache__/metrics.cpython-310.pyc index 0964a20..7067853 100644 Binary files a/vllm/engine/__pycache__/metrics.cpython-310.pyc and b/vllm/engine/__pycache__/metrics.cpython-310.pyc differ diff --git a/vllm/engine/__pycache__/metrics_types.cpython-310.pyc b/vllm/engine/__pycache__/metrics_types.cpython-310.pyc index c952429..0ed046f 100644 Binary files a/vllm/engine/__pycache__/metrics_types.cpython-310.pyc and b/vllm/engine/__pycache__/metrics_types.cpython-310.pyc differ diff --git a/vllm/engine/__pycache__/protocol.cpython-310.pyc b/vllm/engine/__pycache__/protocol.cpython-310.pyc index e01213f..0e434b6 100644 Binary files a/vllm/engine/__pycache__/protocol.cpython-310.pyc and b/vllm/engine/__pycache__/protocol.cpython-310.pyc differ diff --git a/vllm/engine/multiprocessing/__pycache__/__init__.cpython-310.pyc b/vllm/engine/multiprocessing/__pycache__/__init__.cpython-310.pyc index 025eaf5..18aafdf 100644 Binary files a/vllm/engine/multiprocessing/__pycache__/__init__.cpython-310.pyc and b/vllm/engine/multiprocessing/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/engine/multiprocessing/__pycache__/client.cpython-310.pyc b/vllm/engine/multiprocessing/__pycache__/client.cpython-310.pyc index 8b83b75..a24654b 100644 Binary files a/vllm/engine/multiprocessing/__pycache__/client.cpython-310.pyc and b/vllm/engine/multiprocessing/__pycache__/client.cpython-310.pyc differ diff --git a/vllm/engine/multiprocessing/__pycache__/engine.cpython-310.pyc b/vllm/engine/multiprocessing/__pycache__/engine.cpython-310.pyc index 2ebdd9c..52c5583 100644 Binary files a/vllm/engine/multiprocessing/__pycache__/engine.cpython-310.pyc and b/vllm/engine/multiprocessing/__pycache__/engine.cpython-310.pyc differ diff --git a/vllm/engine/output_processor/__pycache__/__init__.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/__init__.cpython-310.pyc index 6280127..d16adb2 100644 Binary files a/vllm/engine/output_processor/__pycache__/__init__.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/engine/output_processor/__pycache__/interfaces.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/interfaces.cpython-310.pyc index ca6075f..13ebd7d 100644 Binary files a/vllm/engine/output_processor/__pycache__/interfaces.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/interfaces.cpython-310.pyc differ diff --git a/vllm/engine/output_processor/__pycache__/multi_step.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/multi_step.cpython-310.pyc index 5ab7771..8dc16c9 100644 Binary files a/vllm/engine/output_processor/__pycache__/multi_step.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/multi_step.cpython-310.pyc differ diff --git a/vllm/engine/output_processor/__pycache__/single_step.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/single_step.cpython-310.pyc index c447d33..6bbc943 100644 Binary files a/vllm/engine/output_processor/__pycache__/single_step.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/single_step.cpython-310.pyc differ diff --git a/vllm/engine/output_processor/__pycache__/stop_checker.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/stop_checker.cpython-310.pyc index f26ea79..c778982 100644 Binary files a/vllm/engine/output_processor/__pycache__/stop_checker.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/stop_checker.cpython-310.pyc differ diff --git a/vllm/engine/output_processor/__pycache__/util.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/util.cpython-310.pyc index 2918745..14ba774 100644 Binary files a/vllm/engine/output_processor/__pycache__/util.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/util.cpython-310.pyc differ diff --git a/vllm/entrypoints/__pycache__/__init__.cpython-310.pyc b/vllm/entrypoints/__pycache__/__init__.cpython-310.pyc index 8efbea9..8ebcd41 100644 Binary files a/vllm/entrypoints/__pycache__/__init__.cpython-310.pyc and b/vllm/entrypoints/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/entrypoints/__pycache__/api_server.cpython-310.pyc b/vllm/entrypoints/__pycache__/api_server.cpython-310.pyc index 0704a52..48c4537 100644 Binary files a/vllm/entrypoints/__pycache__/api_server.cpython-310.pyc and b/vllm/entrypoints/__pycache__/api_server.cpython-310.pyc differ diff --git a/vllm/entrypoints/__pycache__/chat_utils.cpython-310.pyc b/vllm/entrypoints/__pycache__/chat_utils.cpython-310.pyc index a1b01ce..37ddfb7 100644 Binary files a/vllm/entrypoints/__pycache__/chat_utils.cpython-310.pyc and b/vllm/entrypoints/__pycache__/chat_utils.cpython-310.pyc differ diff --git a/vllm/entrypoints/__pycache__/launcher.cpython-310.pyc b/vllm/entrypoints/__pycache__/launcher.cpython-310.pyc index b8dfa82..b67935d 100644 Binary files a/vllm/entrypoints/__pycache__/launcher.cpython-310.pyc and b/vllm/entrypoints/__pycache__/launcher.cpython-310.pyc differ diff --git a/vllm/entrypoints/__pycache__/llm.cpython-310.pyc b/vllm/entrypoints/__pycache__/llm.cpython-310.pyc index 0f22fd2..3f0bb4d 100644 Binary files a/vllm/entrypoints/__pycache__/llm.cpython-310.pyc and b/vllm/entrypoints/__pycache__/llm.cpython-310.pyc differ diff --git a/vllm/entrypoints/__pycache__/logger.cpython-310.pyc b/vllm/entrypoints/__pycache__/logger.cpython-310.pyc index 0df3711..e603f65 100644 Binary files a/vllm/entrypoints/__pycache__/logger.cpython-310.pyc and b/vllm/entrypoints/__pycache__/logger.cpython-310.pyc differ diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 41354dc..5195491 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -161,7 +161,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return "" if model_type == "mllama": return "<|image|>" - if model_type == "qwen2_vl": + if model_type in ("qwen2_vl","qwen2_5_vl"): return "<|vision_start|><|image_pad|><|vision_end|>" if model_type == "molmo": return "" @@ -172,7 +172,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return "<|reserved_special_token_0|>" raise TypeError(f"Unknown model type: {model_type}") elif modality == "video": - if model_type == "qwen2_vl": + if model_type in ("qwen2_vl","qwen2_5_vl"): return "<|vision_start|><|video_pad|><|vision_end|>" raise TypeError(f"Unknown model type: {model_type}") else: diff --git a/vllm/entrypoints/openai/__pycache__/__init__.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/__init__.cpython-310.pyc index 84bcaef..d5f09d2 100644 Binary files a/vllm/entrypoints/openai/__pycache__/__init__.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/__pycache__/api_server.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/api_server.cpython-310.pyc index 59e7f05..6f95699 100644 Binary files a/vllm/entrypoints/openai/__pycache__/api_server.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/api_server.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/__pycache__/cli_args.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/cli_args.cpython-310.pyc index d67a7cd..e8274a5 100644 Binary files a/vllm/entrypoints/openai/__pycache__/cli_args.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/cli_args.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/__pycache__/logits_processors.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/logits_processors.cpython-310.pyc index f28c2a5..45a3dbe 100644 Binary files a/vllm/entrypoints/openai/__pycache__/logits_processors.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/logits_processors.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/__pycache__/protocol.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/protocol.cpython-310.pyc index ddf2de4..a2b1492 100644 Binary files a/vllm/entrypoints/openai/__pycache__/protocol.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/protocol.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/__pycache__/run_batch.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/run_batch.cpython-310.pyc index 69572f5..acb744c 100644 Binary files a/vllm/entrypoints/openai/__pycache__/run_batch.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/run_batch.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/__pycache__/serving_chat.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/serving_chat.cpython-310.pyc index c9e2f78..ac60efe 100644 Binary files a/vllm/entrypoints/openai/__pycache__/serving_chat.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/serving_chat.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/__pycache__/serving_completion.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/serving_completion.cpython-310.pyc index 1928b66..4fb40e4 100644 Binary files a/vllm/entrypoints/openai/__pycache__/serving_completion.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/serving_completion.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/__pycache__/serving_embedding.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/serving_embedding.cpython-310.pyc index 32a12ca..32f8156 100644 Binary files a/vllm/entrypoints/openai/__pycache__/serving_embedding.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/serving_embedding.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/__pycache__/serving_engine.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/serving_engine.cpython-310.pyc index 8920d02..f33f7a0 100644 Binary files a/vllm/entrypoints/openai/__pycache__/serving_engine.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/serving_engine.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/__pycache__/serving_tokenization.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/serving_tokenization.cpython-310.pyc index 258edf8..73a6915 100644 Binary files a/vllm/entrypoints/openai/__pycache__/serving_tokenization.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/serving_tokenization.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ae44b26..ef279d6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -3,7 +3,7 @@ import importlib import inspect import multiprocessing import os -import re +import regex as re import signal import socket import tempfile diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/__init__.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/__init__.cpython-310.pyc index 79baa12..2b6e806 100644 Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/__init__.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/abstract_tool_parser.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/abstract_tool_parser.cpython-310.pyc index ae2f3f3..77c6782 100644 Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/abstract_tool_parser.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/abstract_tool_parser.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/hermes_tool_parser.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/hermes_tool_parser.cpython-310.pyc index 82faaba..f621c02 100644 Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/hermes_tool_parser.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/hermes_tool_parser.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/internlm2_tool_parser.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/internlm2_tool_parser.cpython-310.pyc index e4db40f..15c820d 100644 Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/internlm2_tool_parser.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/internlm2_tool_parser.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/llama_tool_parser.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/llama_tool_parser.cpython-310.pyc index ce23591..dc102bf 100644 Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/llama_tool_parser.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/llama_tool_parser.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/mistral_tool_parser.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/mistral_tool_parser.cpython-310.pyc index b3cdc69..c5f6714 100644 Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/mistral_tool_parser.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/mistral_tool_parser.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/utils.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/utils.cpython-310.pyc index 6d33704..31b9a81 100644 Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/utils.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index bcbcda3..f3cc1d9 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -1,5 +1,5 @@ import json -import re +import regex as re from typing import Dict, List, Sequence, Union import partial_json_parser diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 3cf34bc..f950ab1 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -1,5 +1,5 @@ import json -import re +import regex as re from json import JSONDecodeError, JSONDecoder from typing import Dict, List, Sequence, Union diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index c6dc068..b7d5d0a 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -1,5 +1,5 @@ import json -import re +import regex as re from random import choices from string import ascii_letters, digits from typing import Dict, List, Sequence, Union diff --git a/vllm/envs.py b/vllm/envs.py index 4c9b4ae..3361afb 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -66,6 +66,7 @@ if TYPE_CHECKING: VLLM_SKIP_P2P_CHECK: bool = False VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False VLLM_TORCH_COMPILE_LEVEL: int = 0 + VLLM_V0_USE_OUTLINES_CACHE: bool = False def get_default_cache_root(): @@ -430,6 +431,12 @@ environment_variables: Dict[str, Callable[[], Any]] = { "VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1": lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0" ) == "1", + + # Whether to turn on the outlines cache for V0 + # This cache is unbounded and on disk, so it's not safe to use in + # an environment with potentially malicious users. + "VLLM_V0_USE_OUTLINES_CACHE": + lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1", } # end-env-vars-definition diff --git a/vllm/executor/__pycache__/__init__.cpython-310.pyc b/vllm/executor/__pycache__/__init__.cpython-310.pyc index 06de5ea..a443572 100644 Binary files a/vllm/executor/__pycache__/__init__.cpython-310.pyc and b/vllm/executor/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/cpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/cpu_executor.cpython-310.pyc index 0032ddf..6286a08 100644 Binary files a/vllm/executor/__pycache__/cpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/cpu_executor.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/distributed_gpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/distributed_gpu_executor.cpython-310.pyc index ae7c76d..2a851d8 100644 Binary files a/vllm/executor/__pycache__/distributed_gpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/distributed_gpu_executor.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/executor_base.cpython-310.pyc b/vllm/executor/__pycache__/executor_base.cpython-310.pyc index ccef798..6c483d2 100644 Binary files a/vllm/executor/__pycache__/executor_base.cpython-310.pyc and b/vllm/executor/__pycache__/executor_base.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/gpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/gpu_executor.cpython-310.pyc index 42aa355..8d7cc79 100644 Binary files a/vllm/executor/__pycache__/gpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/gpu_executor.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/msgspec_utils.cpython-310.pyc b/vllm/executor/__pycache__/msgspec_utils.cpython-310.pyc index d824251..9dd4390 100644 Binary files a/vllm/executor/__pycache__/msgspec_utils.cpython-310.pyc and b/vllm/executor/__pycache__/msgspec_utils.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/multiproc_gpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/multiproc_gpu_executor.cpython-310.pyc index 57a1ff8..a9f62cd 100644 Binary files a/vllm/executor/__pycache__/multiproc_gpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/multiproc_gpu_executor.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/multiproc_worker_utils.cpython-310.pyc b/vllm/executor/__pycache__/multiproc_worker_utils.cpython-310.pyc index b092866..7d6a611 100644 Binary files a/vllm/executor/__pycache__/multiproc_worker_utils.cpython-310.pyc and b/vllm/executor/__pycache__/multiproc_worker_utils.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/multiproc_xpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/multiproc_xpu_executor.cpython-310.pyc index 4b31776..af0c391 100644 Binary files a/vllm/executor/__pycache__/multiproc_xpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/multiproc_xpu_executor.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/neuron_executor.cpython-310.pyc b/vllm/executor/__pycache__/neuron_executor.cpython-310.pyc index 2b14ed3..778b237 100644 Binary files a/vllm/executor/__pycache__/neuron_executor.cpython-310.pyc and b/vllm/executor/__pycache__/neuron_executor.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/openvino_executor.cpython-310.pyc b/vllm/executor/__pycache__/openvino_executor.cpython-310.pyc index 79fbeb9..43c6b1f 100644 Binary files a/vllm/executor/__pycache__/openvino_executor.cpython-310.pyc and b/vllm/executor/__pycache__/openvino_executor.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/ray_gpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/ray_gpu_executor.cpython-310.pyc index 2aaa6fe..e279213 100644 Binary files a/vllm/executor/__pycache__/ray_gpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/ray_gpu_executor.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/ray_tpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/ray_tpu_executor.cpython-310.pyc index 7ac3d79..eb316e4 100644 Binary files a/vllm/executor/__pycache__/ray_tpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/ray_tpu_executor.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/ray_utils.cpython-310.pyc b/vllm/executor/__pycache__/ray_utils.cpython-310.pyc index a509882..58e6d0d 100644 Binary files a/vllm/executor/__pycache__/ray_utils.cpython-310.pyc and b/vllm/executor/__pycache__/ray_utils.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/ray_xpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/ray_xpu_executor.cpython-310.pyc index b78487c..686dd9b 100644 Binary files a/vllm/executor/__pycache__/ray_xpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/ray_xpu_executor.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/tpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/tpu_executor.cpython-310.pyc index 02db6b6..59a4791 100644 Binary files a/vllm/executor/__pycache__/tpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/tpu_executor.cpython-310.pyc differ diff --git a/vllm/executor/__pycache__/xpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/xpu_executor.cpython-310.pyc index 651acb1..bbd8076 100644 Binary files a/vllm/executor/__pycache__/xpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/xpu_executor.cpython-310.pyc differ diff --git a/vllm/inputs/__pycache__/__init__.cpython-310.pyc b/vllm/inputs/__pycache__/__init__.cpython-310.pyc index 565b6e4..1ecbfd4 100644 Binary files a/vllm/inputs/__pycache__/__init__.cpython-310.pyc and b/vllm/inputs/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/inputs/__pycache__/data.cpython-310.pyc b/vllm/inputs/__pycache__/data.cpython-310.pyc index b69bc51..437d732 100644 Binary files a/vllm/inputs/__pycache__/data.cpython-310.pyc and b/vllm/inputs/__pycache__/data.cpython-310.pyc differ diff --git a/vllm/inputs/__pycache__/parse.cpython-310.pyc b/vllm/inputs/__pycache__/parse.cpython-310.pyc index f11cc04..4664dc3 100644 Binary files a/vllm/inputs/__pycache__/parse.cpython-310.pyc and b/vllm/inputs/__pycache__/parse.cpython-310.pyc differ diff --git a/vllm/inputs/__pycache__/preprocess.cpython-310.pyc b/vllm/inputs/__pycache__/preprocess.cpython-310.pyc index 1857735..f0ab201 100644 Binary files a/vllm/inputs/__pycache__/preprocess.cpython-310.pyc and b/vllm/inputs/__pycache__/preprocess.cpython-310.pyc differ diff --git a/vllm/inputs/__pycache__/registry.cpython-310.pyc b/vllm/inputs/__pycache__/registry.cpython-310.pyc index c03e8fd..8eaef98 100644 Binary files a/vllm/inputs/__pycache__/registry.cpython-310.pyc and b/vllm/inputs/__pycache__/registry.cpython-310.pyc differ diff --git a/vllm/logging/__pycache__/__init__.cpython-310.pyc b/vllm/logging/__pycache__/__init__.cpython-310.pyc index f37e9d2..84691e3 100644 Binary files a/vllm/logging/__pycache__/__init__.cpython-310.pyc and b/vllm/logging/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/logging/__pycache__/formatter.cpython-310.pyc b/vllm/logging/__pycache__/formatter.cpython-310.pyc index afb951e..ac687a9 100644 Binary files a/vllm/logging/__pycache__/formatter.cpython-310.pyc and b/vllm/logging/__pycache__/formatter.cpython-310.pyc differ diff --git a/vllm/lora/__pycache__/__init__.cpython-310.pyc b/vllm/lora/__pycache__/__init__.cpython-310.pyc index 73ac80f..6a0040c 100644 Binary files a/vllm/lora/__pycache__/__init__.cpython-310.pyc and b/vllm/lora/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/lora/__pycache__/fully_sharded_layers.cpython-310.pyc b/vllm/lora/__pycache__/fully_sharded_layers.cpython-310.pyc index 652738e..7e13e76 100644 Binary files a/vllm/lora/__pycache__/fully_sharded_layers.cpython-310.pyc and b/vllm/lora/__pycache__/fully_sharded_layers.cpython-310.pyc differ diff --git a/vllm/lora/__pycache__/layers.cpython-310.pyc b/vllm/lora/__pycache__/layers.cpython-310.pyc index bcc2b16..7d1ee4c 100644 Binary files a/vllm/lora/__pycache__/layers.cpython-310.pyc and b/vllm/lora/__pycache__/layers.cpython-310.pyc differ diff --git a/vllm/lora/__pycache__/lora.cpython-310.pyc b/vllm/lora/__pycache__/lora.cpython-310.pyc index 7a5f001..f0da988 100644 Binary files a/vllm/lora/__pycache__/lora.cpython-310.pyc and b/vllm/lora/__pycache__/lora.cpython-310.pyc differ diff --git a/vllm/lora/__pycache__/models.cpython-310.pyc b/vllm/lora/__pycache__/models.cpython-310.pyc index 0b47ee7..63d5a53 100644 Binary files a/vllm/lora/__pycache__/models.cpython-310.pyc and b/vllm/lora/__pycache__/models.cpython-310.pyc differ diff --git a/vllm/lora/__pycache__/punica.cpython-310.pyc b/vllm/lora/__pycache__/punica.cpython-310.pyc index 1b72815..aeedbe9 100644 Binary files a/vllm/lora/__pycache__/punica.cpython-310.pyc and b/vllm/lora/__pycache__/punica.cpython-310.pyc differ diff --git a/vllm/lora/__pycache__/request.cpython-310.pyc b/vllm/lora/__pycache__/request.cpython-310.pyc index fba3fa2..0d466ee 100644 Binary files a/vllm/lora/__pycache__/request.cpython-310.pyc and b/vllm/lora/__pycache__/request.cpython-310.pyc differ diff --git a/vllm/lora/__pycache__/utils.cpython-310.pyc b/vllm/lora/__pycache__/utils.cpython-310.pyc index cf125f8..76b259e 100644 Binary files a/vllm/lora/__pycache__/utils.cpython-310.pyc and b/vllm/lora/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/lora/__pycache__/worker_manager.cpython-310.pyc b/vllm/lora/__pycache__/worker_manager.cpython-310.pyc index f99a173..68fe572 100644 Binary files a/vllm/lora/__pycache__/worker_manager.cpython-310.pyc and b/vllm/lora/__pycache__/worker_manager.cpython-310.pyc differ diff --git a/vllm/lora/models.py b/vllm/lora/models.py index aaadca9..8805d05 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -2,7 +2,7 @@ import copy import json import math import os -import re +import regex as re from dataclasses import dataclass, field from typing import Any, Callable, Dict, List, Optional, Type @@ -263,7 +263,7 @@ class LoRAModel(AdapterModel): new_embeddings_tensor_path) elif os.path.isfile(new_embeddings_bin_file_path): embeddings = torch.load(new_embeddings_bin_file_path, - map_location=device) + map_location=device, weights_only=True) rank = config["r"] lora_alpha = config["lora_alpha"] diff --git a/vllm/lora/ops/__pycache__/__init__.cpython-310.pyc b/vllm/lora/ops/__pycache__/__init__.cpython-310.pyc index 9ede199..85eb3c1 100644 Binary files a/vllm/lora/ops/__pycache__/__init__.cpython-310.pyc and b/vllm/lora/ops/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/lora/ops/__pycache__/bgmv_expand.cpython-310.pyc b/vllm/lora/ops/__pycache__/bgmv_expand.cpython-310.pyc index 217f22f..7e6e3cb 100644 Binary files a/vllm/lora/ops/__pycache__/bgmv_expand.cpython-310.pyc and b/vllm/lora/ops/__pycache__/bgmv_expand.cpython-310.pyc differ diff --git a/vllm/lora/ops/__pycache__/bgmv_expand_slice.cpython-310.pyc b/vllm/lora/ops/__pycache__/bgmv_expand_slice.cpython-310.pyc index da16abe..c64e274 100644 Binary files a/vllm/lora/ops/__pycache__/bgmv_expand_slice.cpython-310.pyc and b/vllm/lora/ops/__pycache__/bgmv_expand_slice.cpython-310.pyc differ diff --git a/vllm/lora/ops/__pycache__/bgmv_shrink.cpython-310.pyc b/vllm/lora/ops/__pycache__/bgmv_shrink.cpython-310.pyc index 0f98091..3938bb8 100644 Binary files a/vllm/lora/ops/__pycache__/bgmv_shrink.cpython-310.pyc and b/vllm/lora/ops/__pycache__/bgmv_shrink.cpython-310.pyc differ diff --git a/vllm/lora/ops/__pycache__/sgmv_expand.cpython-310.pyc b/vllm/lora/ops/__pycache__/sgmv_expand.cpython-310.pyc index 08cfc9f..1e54c33 100644 Binary files a/vllm/lora/ops/__pycache__/sgmv_expand.cpython-310.pyc and b/vllm/lora/ops/__pycache__/sgmv_expand.cpython-310.pyc differ diff --git a/vllm/lora/ops/__pycache__/sgmv_expand_slice.cpython-310.pyc b/vllm/lora/ops/__pycache__/sgmv_expand_slice.cpython-310.pyc index fdc90b3..51f1d6a 100644 Binary files a/vllm/lora/ops/__pycache__/sgmv_expand_slice.cpython-310.pyc and b/vllm/lora/ops/__pycache__/sgmv_expand_slice.cpython-310.pyc differ diff --git a/vllm/lora/ops/__pycache__/sgmv_shrink.cpython-310.pyc b/vllm/lora/ops/__pycache__/sgmv_shrink.cpython-310.pyc index e7f1bfa..00b0ff3 100644 Binary files a/vllm/lora/ops/__pycache__/sgmv_shrink.cpython-310.pyc and b/vllm/lora/ops/__pycache__/sgmv_shrink.cpython-310.pyc differ diff --git a/vllm/lora/ops/__pycache__/utils.cpython-310.pyc b/vllm/lora/ops/__pycache__/utils.cpython-310.pyc index dd89854..762bb68 100644 Binary files a/vllm/lora/ops/__pycache__/utils.cpython-310.pyc and b/vllm/lora/ops/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index a780429..066d94a 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -1,5 +1,5 @@ import os -import re +import regex as re from typing import List, Optional, Set, Tuple, Type, Union import huggingface_hub diff --git a/vllm/model_executor/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/__pycache__/__init__.cpython-310.pyc index 3e896cf..984b8c1 100644 Binary files a/vllm/model_executor/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/__pycache__/custom_op.cpython-310.pyc b/vllm/model_executor/__pycache__/custom_op.cpython-310.pyc index 3548469..ddaac9a 100644 Binary files a/vllm/model_executor/__pycache__/custom_op.cpython-310.pyc and b/vllm/model_executor/__pycache__/custom_op.cpython-310.pyc differ diff --git a/vllm/model_executor/__pycache__/parameter.cpython-310.pyc b/vllm/model_executor/__pycache__/parameter.cpython-310.pyc index fe4df37..78a8ef1 100644 Binary files a/vllm/model_executor/__pycache__/parameter.cpython-310.pyc and b/vllm/model_executor/__pycache__/parameter.cpython-310.pyc differ diff --git a/vllm/model_executor/__pycache__/pooling_metadata.cpython-310.pyc b/vllm/model_executor/__pycache__/pooling_metadata.cpython-310.pyc index 354b163..c03a4f1 100644 Binary files a/vllm/model_executor/__pycache__/pooling_metadata.cpython-310.pyc and b/vllm/model_executor/__pycache__/pooling_metadata.cpython-310.pyc differ diff --git a/vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc b/vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc index c02d291..d919ff2 100644 Binary files a/vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc and b/vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc differ diff --git a/vllm/model_executor/__pycache__/utils.cpython-310.pyc b/vllm/model_executor/__pycache__/utils.cpython-310.pyc index 4036015..96650f9 100644 Binary files a/vllm/model_executor/__pycache__/utils.cpython-310.pyc and b/vllm/model_executor/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/model_executor/guided_decoding/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/guided_decoding/__pycache__/__init__.cpython-310.pyc index 35cbb04..59e35da 100644 Binary files a/vllm/model_executor/guided_decoding/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/guided_decoding/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/guided_decoding/__pycache__/guided_fields.cpython-310.pyc b/vllm/model_executor/guided_decoding/__pycache__/guided_fields.cpython-310.pyc index 8db14b7..fbc7392 100644 Binary files a/vllm/model_executor/guided_decoding/__pycache__/guided_fields.cpython-310.pyc and b/vllm/model_executor/guided_decoding/__pycache__/guided_fields.cpython-310.pyc differ diff --git a/vllm/model_executor/guided_decoding/__pycache__/lm_format_enforcer_decoding.cpython-310.pyc b/vllm/model_executor/guided_decoding/__pycache__/lm_format_enforcer_decoding.cpython-310.pyc index 071dcb0..04aec4d 100644 Binary files a/vllm/model_executor/guided_decoding/__pycache__/lm_format_enforcer_decoding.cpython-310.pyc and b/vllm/model_executor/guided_decoding/__pycache__/lm_format_enforcer_decoding.cpython-310.pyc differ diff --git a/vllm/model_executor/guided_decoding/__pycache__/outlines_decoding.cpython-310.pyc b/vllm/model_executor/guided_decoding/__pycache__/outlines_decoding.cpython-310.pyc index 8de5d11..da22e84 100644 Binary files a/vllm/model_executor/guided_decoding/__pycache__/outlines_decoding.cpython-310.pyc and b/vllm/model_executor/guided_decoding/__pycache__/outlines_decoding.cpython-310.pyc differ diff --git a/vllm/model_executor/guided_decoding/__pycache__/outlines_logits_processors.cpython-310.pyc b/vllm/model_executor/guided_decoding/__pycache__/outlines_logits_processors.cpython-310.pyc index 833a519..d6c6685 100644 Binary files a/vllm/model_executor/guided_decoding/__pycache__/outlines_logits_processors.cpython-310.pyc and b/vllm/model_executor/guided_decoding/__pycache__/outlines_logits_processors.cpython-310.pyc differ diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index c28bd71..fb8db44 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -23,12 +23,21 @@ from typing import Callable, DefaultDict, Dict, List, Union import torch from lark import Lark from outlines import grammars -from outlines.caching import cache +from outlines.caching import cache, disable_cache from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write from outlines.fsm.json_schema import build_regex_from_schema from pydantic import BaseModel from transformers import PreTrainedTokenizerBase +import vllm.envs as envs +from vllm.logger import init_logger +logger = init_logger(__name__) +if envs.VLLM_V0_USE_OUTLINES_CACHE: + logger.warning("Enabling outlines cache. This is an unbounded on-disk " + "cache. It may consume a lot of disk space and should " + "not be used with untrusted clients.") +else: + disable_cache() class BaseLogitsProcessor: diff --git a/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc index 49d5dd0..7d3f145 100644 Binary files a/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/activation.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/activation.cpython-310.pyc index a6a2ab0..6d39d5d 100644 Binary files a/vllm/model_executor/layers/__pycache__/activation.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/activation.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc index 0f09c8c..e77cc9c 100644 Binary files a/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/linear.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/linear.cpython-310.pyc index be35bf4..c1cc88f 100644 Binary files a/vllm/model_executor/layers/__pycache__/linear.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/linear.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/logits_processor.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/logits_processor.cpython-310.pyc index bf46f27..f996ecf 100644 Binary files a/vllm/model_executor/layers/__pycache__/logits_processor.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/logits_processor.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/pooler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/pooler.cpython-310.pyc index a97b752..dbb2dae 100644 Binary files a/vllm/model_executor/layers/__pycache__/pooler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/pooler.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc index e99e319..c6362a4 100644 Binary files a/vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/resampler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/resampler.cpython-310.pyc index 7402f04..3807358 100644 Binary files a/vllm/model_executor/layers/__pycache__/resampler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/resampler.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/rotary_embedding.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/rotary_embedding.cpython-310.pyc index 4e5f06b..6eecd9e 100644 Binary files a/vllm/model_executor/layers/__pycache__/rotary_embedding.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/rotary_embedding.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/sampler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/sampler.cpython-310.pyc index 22d4863..4fa31a3 100644 Binary files a/vllm/model_executor/layers/__pycache__/sampler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/sampler.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/spec_decode_base_sampler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/spec_decode_base_sampler.cpython-310.pyc index 879c57f..ae776c2 100644 Binary files a/vllm/model_executor/layers/__pycache__/spec_decode_base_sampler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/spec_decode_base_sampler.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/typical_acceptance_sampler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/typical_acceptance_sampler.cpython-310.pyc index 6c3ed20..9315154 100644 Binary files a/vllm/model_executor/layers/__pycache__/typical_acceptance_sampler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/typical_acceptance_sampler.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc index 2c1fcda..321681b 100644 Binary files a/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/__init__.cpython-310.pyc index 51e1059..2b6e09f 100644 Binary files a/vllm/model_executor/layers/fused_moe/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/fused_moe/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/fused_marlin_moe.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/fused_marlin_moe.cpython-310.pyc index eea9303..e8c7e52 100644 Binary files a/vllm/model_executor/layers/fused_moe/__pycache__/fused_marlin_moe.cpython-310.pyc and b/vllm/model_executor/layers/fused_moe/__pycache__/fused_marlin_moe.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-310.pyc index ed50e46..0fd8959 100644 Binary files a/vllm/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-310.pyc and b/vllm/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/layer.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/layer.cpython-310.pyc index a3b03b7..9684cff 100644 Binary files a/vllm/model_executor/layers/fused_moe/__pycache__/layer.cpython-310.pyc and b/vllm/model_executor/layers/fused_moe/__pycache__/layer.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/moe_pallas.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/moe_pallas.cpython-310.pyc index a09e693..afcf218 100644 Binary files a/vllm/model_executor/layers/fused_moe/__pycache__/moe_pallas.cpython-310.pyc and b/vllm/model_executor/layers/fused_moe/__pycache__/moe_pallas.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8ed74ef..789a77e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -132,7 +132,7 @@ class UnquantizedLinearMethod(LinearMethodBase): layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - if (x.shape[0] == 16384 or x.shape[0] == 15360): + if (x.shape[0] == 8192 or x.shape[0] == 16384 or x.shape[0] == 15360): if bias is None: return x @ layer.weight.T else: diff --git a/vllm/model_executor/layers/mamba/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/mamba/__pycache__/__init__.cpython-310.pyc index 888ccc9..bff33db 100644 Binary files a/vllm/model_executor/layers/mamba/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/mamba/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/mamba/ops/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/mamba/ops/__pycache__/__init__.cpython-310.pyc index a095ccb..db529d1 100644 Binary files a/vllm/model_executor/layers/mamba/ops/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/mamba/ops/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/mamba/ops/__pycache__/causal_conv1d.cpython-310.pyc b/vllm/model_executor/layers/mamba/ops/__pycache__/causal_conv1d.cpython-310.pyc index 5b34390..a809675 100644 Binary files a/vllm/model_executor/layers/mamba/ops/__pycache__/causal_conv1d.cpython-310.pyc and b/vllm/model_executor/layers/mamba/ops/__pycache__/causal_conv1d.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/mamba/ops/__pycache__/mamba_ssm.cpython-310.pyc b/vllm/model_executor/layers/mamba/ops/__pycache__/mamba_ssm.cpython-310.pyc index 0613b1e..9df7c51 100644 Binary files a/vllm/model_executor/layers/mamba/ops/__pycache__/mamba_ssm.cpython-310.pyc and b/vllm/model_executor/layers/mamba/ops/__pycache__/mamba_ssm.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-310.pyc index 41b1bb6..c2312d9 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-310.pyc index 970f5d3..97c6107 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-310.pyc index c6291af..33b34e6 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-310.pyc index 47ed177..bccc8e3 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-310.pyc index 7f1d394..35acfdb 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-310.pyc index 0759d0c..39e0f44 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-310.pyc index a4f4198..5e8362e 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-310.pyc index 6213ec2..a31daaa 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-310.pyc index ea4f3a1..8957b27 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-310.pyc index ac659f7..54b49c9 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-310.pyc index e60b9d0..69d150f 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-310.pyc index 63e7832..72e41da 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-310.pyc index 7be428a..a9d75f1 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-310.pyc index 3e2ccc9..d54eeef 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-310.pyc index 7041cbc..fbc58d2 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-310.pyc index f3d9348..4b11c2f 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-310.pyc index e9a9e83..44e5b41 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-310.pyc index 326ec50..5f1f5fc 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-310.pyc index acafb63..87d5593 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-310.pyc index 8e7cd18..540c07b 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-310.pyc index 0993354..32780cf 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-310.pyc index b0cf65f..e1ee39c 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-310.pyc index 39b8434..470be32 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/__pycache__/w8a16.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/w8a16.cpython-310.pyc index cc44e16..9a134f3 100644 Binary files a/vllm/model_executor/layers/quantization/__pycache__/w8a16.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/w8a16.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-310.pyc index 6452a20..a551083 100644 Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-310.pyc index d9325cc..db6f166 100644 Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-310.pyc index 73b4a52..c2bceb0 100644 Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-310.pyc index 1e28ce3..408ed02 100644 Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-310.pyc index b30bdb5..d24a3fc 100644 Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-310.pyc index 8bdd453..fe4692a 100644 Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-310.pyc index 4c5d1b6..258d504 100644 Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-310.pyc index 2f1b90c..a286914 100644 Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-310.pyc index be73cc2..9d03344 100644 Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-310.pyc index 8776b45..f2aa37c 100644 Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-310.pyc index 7fea985..c84fb93 100644 Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index fc531b9..785f7ff 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,4 +1,4 @@ -import re +import regex as re from enum import Enum from typing import Any, Dict, Iterable, Optional, Union diff --git a/vllm/model_executor/layers/quantization/kernels/__pycache__/MPLinearKernel.cpython-310.pyc b/vllm/model_executor/layers/quantization/kernels/__pycache__/MPLinearKernel.cpython-310.pyc index caa2d19..6690d8b 100644 Binary files a/vllm/model_executor/layers/quantization/kernels/__pycache__/MPLinearKernel.cpython-310.pyc and b/vllm/model_executor/layers/quantization/kernels/__pycache__/MPLinearKernel.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-310.pyc index 72e7450..f2827fc 100644 Binary files a/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/kernels/__pycache__/machete.cpython-310.pyc b/vllm/model_executor/layers/quantization/kernels/__pycache__/machete.cpython-310.pyc index 6a67b05..6c0760b 100644 Binary files a/vllm/model_executor/layers/quantization/kernels/__pycache__/machete.cpython-310.pyc and b/vllm/model_executor/layers/quantization/kernels/__pycache__/machete.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/kernels/__pycache__/marlin.cpython-310.pyc b/vllm/model_executor/layers/quantization/kernels/__pycache__/marlin.cpython-310.pyc index 478759a..b226a8e 100644 Binary files a/vllm/model_executor/layers/quantization/kernels/__pycache__/marlin.cpython-310.pyc and b/vllm/model_executor/layers/quantization/kernels/__pycache__/marlin.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-310.pyc index 283737e..ea126a2 100644 Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/layer_utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/layer_utils.cpython-310.pyc index 9c7dfc0..7a710e9 100644 Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/layer_utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/layer_utils.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/machete_utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/machete_utils.cpython-310.pyc index 9ec857a..1d23696 100644 Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/machete_utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/machete_utils.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils.cpython-310.pyc index 338b186..8c4e2f5 100644 Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp8.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp8.cpython-310.pyc index cc9b2e8..fbd93ab 100644 Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp8.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-310.pyc index f8cb1bc..c5929a4 100644 Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-310.pyc index 16f3a63..26930bc 100644 Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_qqq.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_qqq.cpython-310.pyc index 119fea0..3bf1f4b 100644 Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_qqq.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_qqq.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-310.pyc index 076fe51..5379065 100644 Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-310.pyc differ diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/w8a8_utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/w8a8_utils.cpython-310.pyc index 14fcc99..e32cfbf 100644 Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/w8a8_utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/w8a8_utils.cpython-310.pyc differ diff --git a/vllm/model_executor/model_loader/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/__init__.cpython-310.pyc index 3a5a9b4..93f94a4 100644 Binary files a/vllm/model_executor/model_loader/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/model_loader/__pycache__/loader.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/loader.cpython-310.pyc index 82e3897..50280b7 100644 Binary files a/vllm/model_executor/model_loader/__pycache__/loader.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/loader.cpython-310.pyc differ diff --git a/vllm/model_executor/model_loader/__pycache__/neuron.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/neuron.cpython-310.pyc index c6314a9..4d66454 100644 Binary files a/vllm/model_executor/model_loader/__pycache__/neuron.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/neuron.cpython-310.pyc differ diff --git a/vllm/model_executor/model_loader/__pycache__/openvino.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/openvino.cpython-310.pyc index d2e597f..f256d7f 100644 Binary files a/vllm/model_executor/model_loader/__pycache__/openvino.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/openvino.cpython-310.pyc differ diff --git a/vllm/model_executor/model_loader/__pycache__/tensorizer.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/tensorizer.cpython-310.pyc index 3bd15bb..8993fb1 100644 Binary files a/vllm/model_executor/model_loader/__pycache__/tensorizer.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/tensorizer.cpython-310.pyc differ diff --git a/vllm/model_executor/model_loader/__pycache__/utils.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/utils.cpython-310.pyc index 059d9cd..33127c2 100644 Binary files a/vllm/model_executor/model_loader/__pycache__/utils.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/model_executor/model_loader/__pycache__/weight_utils.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/weight_utils.cpython-310.pyc index ccaa569..a2a2667 100644 Binary files a/vllm/model_executor/model_loader/__pycache__/weight_utils.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/weight_utils.cpython-310.pyc differ diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 36f33d6..da224ec 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -2,7 +2,7 @@ import argparse import dataclasses import io import os -import re +import regex as re import time from dataclasses import dataclass from functools import partial diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 1e2857e..746b0be 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -85,7 +85,7 @@ def convert_bin_to_safetensor_file( pt_filename: str, sf_filename: str, ) -> None: - loaded = torch.load(pt_filename, map_location="cpu") + loaded = torch.load(pt_filename, map_location="cpu", weights_only=True) if "state_dict" in loaded: loaded = loaded["state_dict"] shared = _shared_pointers(loaded) @@ -373,7 +373,7 @@ def np_cache_weights_iterator( disable=not enable_tqdm, bar_format=_BAR_FORMAT, ): - state = torch.load(bin_file, map_location="cpu") + state = torch.load(bin_file, map_location="cpu", weights_only=True) for name, param in state.items(): param_path = os.path.join(np_folder, name) with open(param_path, "wb") as f: @@ -422,7 +422,7 @@ def pt_weights_iterator( disable=not enable_tqdm, bar_format=_BAR_FORMAT, ): - state = torch.load(bin_file, map_location="cpu") + state = torch.load(bin_file, map_location="cpu", weights_only=True) for name, param in state.items(): yield name, param del state diff --git a/vllm/model_executor/models/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/models/__pycache__/__init__.cpython-310.pyc index 5ef0ec3..d17d8c8 100644 Binary files a/vllm/model_executor/models/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/arctic.cpython-310.pyc b/vllm/model_executor/models/__pycache__/arctic.cpython-310.pyc index 7eac109..3dd3e7c 100644 Binary files a/vllm/model_executor/models/__pycache__/arctic.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/arctic.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/baichuan.cpython-310.pyc b/vllm/model_executor/models/__pycache__/baichuan.cpython-310.pyc index 7318751..b43274e 100644 Binary files a/vllm/model_executor/models/__pycache__/baichuan.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/baichuan.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/bart.cpython-310.pyc b/vllm/model_executor/models/__pycache__/bart.cpython-310.pyc index 4c44deb..363f805 100644 Binary files a/vllm/model_executor/models/__pycache__/bart.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/bart.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/blip.cpython-310.pyc b/vllm/model_executor/models/__pycache__/blip.cpython-310.pyc index d540977..f27531a 100644 Binary files a/vllm/model_executor/models/__pycache__/blip.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/blip.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/blip2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/blip2.cpython-310.pyc index 382fc21..ff0f67f 100644 Binary files a/vllm/model_executor/models/__pycache__/blip2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/blip2.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/bloom.cpython-310.pyc b/vllm/model_executor/models/__pycache__/bloom.cpython-310.pyc index 81e0e5e..e2bdd4b 100644 Binary files a/vllm/model_executor/models/__pycache__/bloom.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/bloom.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/chameleon.cpython-310.pyc b/vllm/model_executor/models/__pycache__/chameleon.cpython-310.pyc index 8f0dc82..4d355f0 100644 Binary files a/vllm/model_executor/models/__pycache__/chameleon.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/chameleon.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/chatglm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/chatglm.cpython-310.pyc index dadaa45..5a0e6b0 100644 Binary files a/vllm/model_executor/models/__pycache__/chatglm.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/chatglm.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/clip.cpython-310.pyc b/vllm/model_executor/models/__pycache__/clip.cpython-310.pyc index 7dccad7..8d371d8 100644 Binary files a/vllm/model_executor/models/__pycache__/clip.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/clip.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/commandr.cpython-310.pyc b/vllm/model_executor/models/__pycache__/commandr.cpython-310.pyc index cdfcd5d..91e8bc6 100644 Binary files a/vllm/model_executor/models/__pycache__/commandr.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/commandr.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/dbrx.cpython-310.pyc b/vllm/model_executor/models/__pycache__/dbrx.cpython-310.pyc index ddb1e64..2426736 100644 Binary files a/vllm/model_executor/models/__pycache__/dbrx.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/dbrx.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/decilm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/decilm.cpython-310.pyc index 9479d4e..30d7d4c 100644 Binary files a/vllm/model_executor/models/__pycache__/decilm.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/decilm.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/deepseek.cpython-310.pyc b/vllm/model_executor/models/__pycache__/deepseek.cpython-310.pyc index 4089242..4533427 100644 Binary files a/vllm/model_executor/models/__pycache__/deepseek.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/deepseek.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/deepseek_v2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/deepseek_v2.cpython-310.pyc index cde5da2..bb2b8a7 100644 Binary files a/vllm/model_executor/models/__pycache__/deepseek_v2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/deepseek_v2.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/eagle.cpython-310.pyc b/vllm/model_executor/models/__pycache__/eagle.cpython-310.pyc index 52cb839..9aedd9d 100644 Binary files a/vllm/model_executor/models/__pycache__/eagle.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/eagle.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/exaone.cpython-310.pyc b/vllm/model_executor/models/__pycache__/exaone.cpython-310.pyc index 48f287a..3d88bd0 100644 Binary files a/vllm/model_executor/models/__pycache__/exaone.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/exaone.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/falcon.cpython-310.pyc b/vllm/model_executor/models/__pycache__/falcon.cpython-310.pyc index 90692d5..bba13aa 100644 Binary files a/vllm/model_executor/models/__pycache__/falcon.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/falcon.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/fuyu.cpython-310.pyc b/vllm/model_executor/models/__pycache__/fuyu.cpython-310.pyc index 81ff33d..03c9d37 100644 Binary files a/vllm/model_executor/models/__pycache__/fuyu.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/fuyu.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/gemma.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gemma.cpython-310.pyc index e2608e5..31ad8d3 100644 Binary files a/vllm/model_executor/models/__pycache__/gemma.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gemma.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/gemma2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gemma2.cpython-310.pyc index 6b8f9e3..eba3d7d 100644 Binary files a/vllm/model_executor/models/__pycache__/gemma2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gemma2.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/gemma2_embedding.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gemma2_embedding.cpython-310.pyc index ce06079..601e6c3 100644 Binary files a/vllm/model_executor/models/__pycache__/gemma2_embedding.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gemma2_embedding.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/glm4.cpython-310.pyc b/vllm/model_executor/models/__pycache__/glm4.cpython-310.pyc index b6c8e16..4d465ee 100644 Binary files a/vllm/model_executor/models/__pycache__/glm4.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/glm4.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/glm4_vision_encoder.cpython-310.pyc b/vllm/model_executor/models/__pycache__/glm4_vision_encoder.cpython-310.pyc index 1efd14e..2025bae 100644 Binary files a/vllm/model_executor/models/__pycache__/glm4_vision_encoder.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/glm4_vision_encoder.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/gpt2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gpt2.cpython-310.pyc index 3cc35e5..39602e6 100644 Binary files a/vllm/model_executor/models/__pycache__/gpt2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gpt2.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/gpt_bigcode.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gpt_bigcode.cpython-310.pyc index 97c4c4c..6fa561a 100644 Binary files a/vllm/model_executor/models/__pycache__/gpt_bigcode.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gpt_bigcode.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/gpt_j.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gpt_j.cpython-310.pyc index 2025e1d..1375c93 100644 Binary files a/vllm/model_executor/models/__pycache__/gpt_j.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gpt_j.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/gpt_neox.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gpt_neox.cpython-310.pyc index f551e48..a0a3d4a 100644 Binary files a/vllm/model_executor/models/__pycache__/gpt_neox.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gpt_neox.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/granite.cpython-310.pyc b/vllm/model_executor/models/__pycache__/granite.cpython-310.pyc index 2117e32..c48c518 100644 Binary files a/vllm/model_executor/models/__pycache__/granite.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/granite.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/granitemoe.cpython-310.pyc b/vllm/model_executor/models/__pycache__/granitemoe.cpython-310.pyc index 915ea81..92d5160 100644 Binary files a/vllm/model_executor/models/__pycache__/granitemoe.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/granitemoe.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/idefics2_vision_model.cpython-310.pyc b/vllm/model_executor/models/__pycache__/idefics2_vision_model.cpython-310.pyc index c0eb9ce..ddba331 100644 Binary files a/vllm/model_executor/models/__pycache__/idefics2_vision_model.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/idefics2_vision_model.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/interfaces.cpython-310.pyc b/vllm/model_executor/models/__pycache__/interfaces.cpython-310.pyc index bfc0a43..54a8e4b 100644 Binary files a/vllm/model_executor/models/__pycache__/interfaces.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/interfaces.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/interfaces_base.cpython-310.pyc b/vllm/model_executor/models/__pycache__/interfaces_base.cpython-310.pyc index bd8289c..30730cf 100644 Binary files a/vllm/model_executor/models/__pycache__/interfaces_base.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/interfaces_base.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/intern_vit.cpython-310.pyc b/vllm/model_executor/models/__pycache__/intern_vit.cpython-310.pyc index 2c2fe53..f6c30cc 100644 Binary files a/vllm/model_executor/models/__pycache__/intern_vit.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/intern_vit.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/internlm2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/internlm2.cpython-310.pyc index cdf8002..e2ac992 100644 Binary files a/vllm/model_executor/models/__pycache__/internlm2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/internlm2.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/internvl.cpython-310.pyc b/vllm/model_executor/models/__pycache__/internvl.cpython-310.pyc index c2207f6..72b1906 100644 Binary files a/vllm/model_executor/models/__pycache__/internvl.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/internvl.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/jais.cpython-310.pyc b/vllm/model_executor/models/__pycache__/jais.cpython-310.pyc index 68b8894..b727dd6 100644 Binary files a/vllm/model_executor/models/__pycache__/jais.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/jais.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/jamba.cpython-310.pyc b/vllm/model_executor/models/__pycache__/jamba.cpython-310.pyc index 088a165..753a1d6 100644 Binary files a/vllm/model_executor/models/__pycache__/jamba.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/jamba.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/llama.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llama.cpython-310.pyc index 09e72b5..c2c4cd6 100644 Binary files a/vllm/model_executor/models/__pycache__/llama.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llama.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/llama_embedding.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llama_embedding.cpython-310.pyc index be7ed35..6ddb654 100644 Binary files a/vllm/model_executor/models/__pycache__/llama_embedding.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llama_embedding.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/llava.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llava.cpython-310.pyc index b0e994d..b0a04c7 100644 Binary files a/vllm/model_executor/models/__pycache__/llava.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llava.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/llava_next.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llava_next.cpython-310.pyc index 491a949..fb98303 100644 Binary files a/vllm/model_executor/models/__pycache__/llava_next.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llava_next.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/llava_next_video.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llava_next_video.cpython-310.pyc index ba9e031..e747e99 100644 Binary files a/vllm/model_executor/models/__pycache__/llava_next_video.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llava_next_video.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/llava_onevision.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llava_onevision.cpython-310.pyc index 3a3049e..a27059f 100644 Binary files a/vllm/model_executor/models/__pycache__/llava_onevision.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llava_onevision.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/mamba.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mamba.cpython-310.pyc index 32a4981..3cba92a 100644 Binary files a/vllm/model_executor/models/__pycache__/mamba.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mamba.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/mamba_cache.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mamba_cache.cpython-310.pyc index 9ea4aa7..b677ee7 100644 Binary files a/vllm/model_executor/models/__pycache__/mamba_cache.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mamba_cache.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/medusa.cpython-310.pyc b/vllm/model_executor/models/__pycache__/medusa.cpython-310.pyc index 1576df2..ab11d57 100644 Binary files a/vllm/model_executor/models/__pycache__/medusa.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/medusa.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/minicpm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/minicpm.cpython-310.pyc index f028c81..6695051 100644 Binary files a/vllm/model_executor/models/__pycache__/minicpm.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/minicpm.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/minicpm3.cpython-310.pyc b/vllm/model_executor/models/__pycache__/minicpm3.cpython-310.pyc index 24638a6..9afdf52 100644 Binary files a/vllm/model_executor/models/__pycache__/minicpm3.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/minicpm3.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/minicpmv.cpython-310.pyc b/vllm/model_executor/models/__pycache__/minicpmv.cpython-310.pyc index 7db5ee3..7bafcf1 100644 Binary files a/vllm/model_executor/models/__pycache__/minicpmv.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/minicpmv.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/mixtral.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mixtral.cpython-310.pyc index ddfa63f..ee61e0b 100644 Binary files a/vllm/model_executor/models/__pycache__/mixtral.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mixtral.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/mixtral_quant.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mixtral_quant.cpython-310.pyc index f1fc24f..eaf2ba1 100644 Binary files a/vllm/model_executor/models/__pycache__/mixtral_quant.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mixtral_quant.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/mllama.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mllama.cpython-310.pyc index f41d430..8738631 100644 Binary files a/vllm/model_executor/models/__pycache__/mllama.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mllama.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/mlp_speculator.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mlp_speculator.cpython-310.pyc index 527a4fd..249a005 100644 Binary files a/vllm/model_executor/models/__pycache__/mlp_speculator.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mlp_speculator.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/module_mapping.cpython-310.pyc b/vllm/model_executor/models/__pycache__/module_mapping.cpython-310.pyc index bf4fb66..bb01564 100644 Binary files a/vllm/model_executor/models/__pycache__/module_mapping.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/module_mapping.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/molmo.cpython-310.pyc b/vllm/model_executor/models/__pycache__/molmo.cpython-310.pyc index 6619caf..e5dbd8b 100644 Binary files a/vllm/model_executor/models/__pycache__/molmo.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/molmo.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/mpt.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mpt.cpython-310.pyc index 7238d6f..cb952cb 100644 Binary files a/vllm/model_executor/models/__pycache__/mpt.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mpt.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/nemotron.cpython-310.pyc b/vllm/model_executor/models/__pycache__/nemotron.cpython-310.pyc index 54a91ea..0d8568c 100644 Binary files a/vllm/model_executor/models/__pycache__/nemotron.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/nemotron.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/nvlm_d.cpython-310.pyc b/vllm/model_executor/models/__pycache__/nvlm_d.cpython-310.pyc index 829adaf..7d71113 100644 Binary files a/vllm/model_executor/models/__pycache__/nvlm_d.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/nvlm_d.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/olmo.cpython-310.pyc b/vllm/model_executor/models/__pycache__/olmo.cpython-310.pyc index db57da9..8d0aa1f 100644 Binary files a/vllm/model_executor/models/__pycache__/olmo.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/olmo.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/olmoe.cpython-310.pyc b/vllm/model_executor/models/__pycache__/olmoe.cpython-310.pyc index d428615..78f03c7 100644 Binary files a/vllm/model_executor/models/__pycache__/olmoe.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/olmoe.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/opt.cpython-310.pyc b/vllm/model_executor/models/__pycache__/opt.cpython-310.pyc index 1998f13..d13c4cc 100644 Binary files a/vllm/model_executor/models/__pycache__/opt.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/opt.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/orion.cpython-310.pyc b/vllm/model_executor/models/__pycache__/orion.cpython-310.pyc index 3e1747f..b6cf059 100644 Binary files a/vllm/model_executor/models/__pycache__/orion.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/orion.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/paligemma.cpython-310.pyc b/vllm/model_executor/models/__pycache__/paligemma.cpython-310.pyc index 06cef71..c87e787 100644 Binary files a/vllm/model_executor/models/__pycache__/paligemma.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/paligemma.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/persimmon.cpython-310.pyc b/vllm/model_executor/models/__pycache__/persimmon.cpython-310.pyc index 7102bb8..3a1f833 100644 Binary files a/vllm/model_executor/models/__pycache__/persimmon.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/persimmon.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/phi.cpython-310.pyc b/vllm/model_executor/models/__pycache__/phi.cpython-310.pyc index e081a8c..150fbf8 100644 Binary files a/vllm/model_executor/models/__pycache__/phi.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/phi.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/phi3.cpython-310.pyc b/vllm/model_executor/models/__pycache__/phi3.cpython-310.pyc index 55c4858..ee65fa5 100644 Binary files a/vllm/model_executor/models/__pycache__/phi3.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/phi3.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/phi3_small.cpython-310.pyc b/vllm/model_executor/models/__pycache__/phi3_small.cpython-310.pyc index c1d83d4..4e9d52a 100644 Binary files a/vllm/model_executor/models/__pycache__/phi3_small.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/phi3_small.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/phi3v.cpython-310.pyc b/vllm/model_executor/models/__pycache__/phi3v.cpython-310.pyc index daff7ba..359db8a 100644 Binary files a/vllm/model_executor/models/__pycache__/phi3v.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/phi3v.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/phimoe.cpython-310.pyc b/vllm/model_executor/models/__pycache__/phimoe.cpython-310.pyc index 9c26fe1..77f5a83 100644 Binary files a/vllm/model_executor/models/__pycache__/phimoe.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/phimoe.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/pixtral.cpython-310.pyc b/vllm/model_executor/models/__pycache__/pixtral.cpython-310.pyc index 2c1458f..7b4d3c7 100644 Binary files a/vllm/model_executor/models/__pycache__/pixtral.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/pixtral.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/qwen.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen.cpython-310.pyc index d032ca4..339947a 100644 Binary files a/vllm/model_executor/models/__pycache__/qwen.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/qwen2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen2.cpython-310.pyc index 06ac033..6c49490 100644 Binary files a/vllm/model_executor/models/__pycache__/qwen2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen2.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/qwen2_5_vl.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen2_5_vl.cpython-310.pyc new file mode 100644 index 0000000..e96226c Binary files /dev/null and b/vllm/model_executor/models/__pycache__/qwen2_5_vl.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/qwen2_moe.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen2_moe.cpython-310.pyc index 6c33bbe..b62b317 100644 Binary files a/vllm/model_executor/models/__pycache__/qwen2_moe.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen2_moe.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/qwen2_rm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen2_rm.cpython-310.pyc index 8b2d8b0..01486a9 100644 Binary files a/vllm/model_executor/models/__pycache__/qwen2_rm.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen2_rm.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/qwen2_vl.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen2_vl.cpython-310.pyc index de7b1b9..21a2a47 100644 Binary files a/vllm/model_executor/models/__pycache__/qwen2_vl.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen2_vl.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/qwen3.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen3.cpython-310.pyc index 11f7142..58765bc 100644 Binary files a/vllm/model_executor/models/__pycache__/qwen3.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen3.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/qwen3_moe.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen3_moe.cpython-310.pyc index 6aed6f1..61a305e 100644 Binary files a/vllm/model_executor/models/__pycache__/qwen3_moe.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen3_moe.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/registry.cpython-310.pyc b/vllm/model_executor/models/__pycache__/registry.cpython-310.pyc index a3999c8..576a538 100644 Binary files a/vllm/model_executor/models/__pycache__/registry.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/registry.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/siglip.cpython-310.pyc b/vllm/model_executor/models/__pycache__/siglip.cpython-310.pyc index 186536a..b3c6fe0 100644 Binary files a/vllm/model_executor/models/__pycache__/siglip.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/siglip.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/solar.cpython-310.pyc b/vllm/model_executor/models/__pycache__/solar.cpython-310.pyc index bde0046..150fd0b 100644 Binary files a/vllm/model_executor/models/__pycache__/solar.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/solar.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/stablelm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/stablelm.cpython-310.pyc index ba2924f..c4d0eb0 100644 Binary files a/vllm/model_executor/models/__pycache__/stablelm.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/stablelm.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/starcoder2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/starcoder2.cpython-310.pyc index d16115e..f9accf5 100644 Binary files a/vllm/model_executor/models/__pycache__/starcoder2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/starcoder2.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/ultravox.cpython-310.pyc b/vllm/model_executor/models/__pycache__/ultravox.cpython-310.pyc index 4c598e6..199e76a 100644 Binary files a/vllm/model_executor/models/__pycache__/ultravox.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/ultravox.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/utils.cpython-310.pyc b/vllm/model_executor/models/__pycache__/utils.cpython-310.pyc index b94a9ba..1f22f8d 100644 Binary files a/vllm/model_executor/models/__pycache__/utils.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/model_executor/models/__pycache__/xverse.cpython-310.pyc b/vllm/model_executor/models/__pycache__/xverse.cpython-310.pyc index 71a46f3..e621a3c 100644 Binary files a/vllm/model_executor/models/__pycache__/xverse.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/xverse.cpython-310.pyc differ diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 9024831..203badb 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -4,7 +4,7 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- -import re +import regex as re from functools import cached_property, partial from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 9ee4dd0..0236fc4 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -22,7 +22,7 @@ # limitations under the License. """Inference-only MiniCPM-V model compatible with HuggingFace weights.""" import math -import re +import regex as re from functools import partial from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ccfee16..2ff4e04 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1,6 +1,6 @@ import logging import math -import re +import regex as re from array import array from dataclasses import dataclass from functools import lru_cache, partial diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 00a04da..523375f 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import itertools -import re +import regex as re from functools import cached_property, lru_cache from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index fd8a27e..3c94173 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -6,7 +6,7 @@ """Inference-only QWen model compatible with HuggingFace weights.""" import math -import re +import regex as re from functools import partial from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py new file mode 100644 index 0000000..bae5e54 --- /dev/null +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -0,0 +1,1213 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +# Copyright 2024 The Qwen team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" +from functools import lru_cache, partial +from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, + Tuple, Type, TypedDict, Union) + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat +from PIL import Image +from transformers.image_utils import (get_image_size, + infer_channel_dimension_format, + to_numpy_array) +from transformers.models.qwen2_vl.image_processing_qwen2_vl import (make_batched_videos, smart_resize) +from transformers.models.emu3.image_processing_emu3 import make_batched_images + +import vllm.envs as envs +from vllm.attention import AttentionMetadata +from vllm.attention.selector import (_Backend, backend_name_to_enum, + get_global_forced_attn_backend) +from vllm.config import CacheConfig, MultiModalConfig +from vllm.distributed import get_pp_group, parallel_state, tensor_model_parallel_all_gather +from vllm.distributed import utils as dist_utils +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.logger import init_logger +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.qwen2 import Qwen2Model +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalInputs) +from vllm.multimodal.base import MultiModalData +from vllm.multimodal.image import cached_get_image_processor +from vllm.platforms import current_platform +from vllm.sequence import IntermediateTensors, SequenceData +from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( + Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) +from vllm.transformers_utils.processor import get_processor +from vllm.utils import is_cpu + +from .interfaces import SupportsMultiModal, SupportsPP +from .utils import (PPMissingLayer, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory) + +logger = init_logger(__name__) + +# === Vision Inputs === # + + +class Qwen2_5_VLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: + `(num_patches, num_channels * patch_size * patch_size)` + """ + + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +class Qwen2_5_VLImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + `hidden_size` must match the hidden size of language model backbone. + """ + + +Qwen2_5_VLImageInputs = Union[Qwen2_5_VLImagePixelInputs, + Qwen2_5_VLImageEmbeddingInputs] + + +class Qwen2_5_VLVideoInputs(TypedDict): + pixel_values_videos: torch.Tensor + """Shape: + `(num_patches, + num_channels * temporal_patch_size * patch_size * patch_size)` + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +# === Vision Encoder === # + + +class Qwen2_5_VisionMLP(nn.Module): + + def __init__( + self, + in_features: int, + hidden_features: int = None, + bias: bool = False, + act_layer: Callable[[torch.Tensor], torch.Tensor] = F.silu, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.gate_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config) + self.up_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config) + self.down_proj = RowParallelLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config) + self.act = act_layer + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_gate, _ = self.gate_proj(x) + x_gate = self.act(x_gate) + x_up, _ = self.up_proj(x) + x_down, _ = self.down_proj(x_gate * x_up) + return x_down + + +def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor: + if not interleaved: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1, x2 = x[..., ::2], x[..., 1::2] + return rearrange(torch.stack((-x2, x1), dim=-1), + "... d two -> ... (d two)", + two=2) + + +def apply_rotary_emb_torch(x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + interleaved: bool = False) -> torch.Tensor: + """ + x: (batch_size, seqlen, nheads, headdim) + cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2) + """ + ro_dim = cos.shape[-1] * 2 + assert ro_dim <= x.shape[-1] + cos = repeat( + cos, + "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") + sin = repeat( + sin, + "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") + return torch.cat( + [ + x[..., :ro_dim] * cos + + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:] + ], + dim=-1, + ) + + +def apply_rotary_pos_emb_vision(t: torch.Tensor, + freqs: torch.Tensor) -> torch.Tensor: + t_ = t.float() + cos = freqs.cos() + sin = freqs.sin() + output = apply_rotary_emb_torch(t_, cos, sin).type_as(t) + return output + + +class Qwen2_5_VisionAttention(nn.Module): + + def __init__( + self, + embed_dim: Optional[int] = None, + num_heads: Optional[int] = None, + projection_size: Optional[int] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + # Per attention head and per partition values. + self.tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads) + self.num_attention_heads_per_partition = dist_utils.divide( + num_heads, self.tp_size) + + self.qkv = ColumnParallelLinear(input_size=embed_dim, + output_size=3 * projection_size, + quant_config=quant_config) + self.proj = RowParallelLinear(input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config) + + # Detect attention implementation. + selected_backend: Optional[_Backend] = get_global_forced_attn_backend() + if selected_backend is None: + backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND + if backend_by_env_var is not None: + selected_backend = backend_name_to_enum(backend_by_env_var) + if selected_backend is None: + # For Volta and Turing GPUs, use xformers instead. + device_available = current_platform.has_device_capability(80) + if device_available: + from transformers.utils import is_flash_attn_2_available + + if is_flash_attn_2_available(): + self._use_flash_attn = True + else: + logger.warning( + "Current Qwen2-VL implementation has a bug with " + "`vllm-flash-attn` inside vision module, so we use " + "xformers backend instead. You can run `pip install " + "flash-attn to use flash-attention backend.") + self._use_flash_attn = False + else: + self._use_flash_attn = False + else: + if selected_backend == _Backend.FLASH_ATTN: + self._use_flash_attn = True + elif selected_backend == _Backend.XFORMERS: + self._use_flash_attn = False + else: + raise RuntimeError( + f"Qwen2-5-VL does not support {selected_backend} backend now." + ) + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # [s, b, 3 * head * head_dim] + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = tensor_model_parallel_all_gather(qkv) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] + q, k, v = qkv.chunk(3, dim=2) + + # 3 * [s, b, head * head_dim] + if self.tp_size > 1: + splitter = partial(dist_utils.split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + v = splitter(v)[self.tp_rank] + + # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] + new_shape = (seq_len, bs, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor = None, + ) -> torch.Tensor: + # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) + batch_size = q.shape[1] + + q, k, v = [ + rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v) + ] + if rotary_pos_emb is not None: + q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) + k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + + from ixformer.contrib.xformers import ops as xops + from xformers.ops.fmha.attn_bias import (AttentionBias, + BlockDiagonalMask,) + + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, + kv_seqlen=None) + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None, op=xops.fmha.flash.FwOp()) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() + + output, _ = self.proj(context_layer) + return output + + +class Qwen2_5_VisionBlock(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_hidden_dim: int, + act_layer: Callable[[torch.Tensor], torch.Tensor] = F.silu, + norm_layer: Type[nn.Module] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.norm1 = norm_layer(dim) + self.norm2 = norm_layer(dim) + + + self.attn = Qwen2_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config) + self.mlp = Qwen2_5_VisionMLP(in_features=dim, + hidden_features=mlp_hidden_dim, + bias=True, + act_layer=act_layer, + quant_config=quant_config) + + def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor) -> torch.Tensor: + x = x + self.attn(self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb) + x = x + self.mlp(self.norm2(x)) + return x + + +class Qwen2_5_VisionPatchEmbed(nn.Module): + + def __init__( + self, + patch_size: int = 14, + temporal_patch_size: int = 2, + in_chans: int = 3, + hidden_size: int = 1152, + ) -> None: + super().__init__() + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.hidden_size = hidden_size + + kernel_size = [temporal_patch_size, patch_size, patch_size] + self.proj = nn.Conv3d(in_chans, + hidden_size, + kernel_size=kernel_size, + stride=kernel_size, + bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, + self.patch_size) + x = self.proj(x).view(L, self.hidden_size) + return x + + +class Qwen2_5_VisionPatchMerger(nn.Module): + + def __init__( + self, + d_model: int, + context_dim: int, + norm_layer: Type[nn.Module] = None, + spatial_merge_size: int = 2, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size**2) + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.ln_q = norm_layer(context_dim) + self.mlp = nn.ModuleList([ + ColumnParallelLinear(self.hidden_size, + self.hidden_size, + bias=True, + quant_config=quant_config), + nn.GELU(), + RowParallelLinear(self.hidden_size, + d_model, + bias=True, + quant_config=quant_config), + ]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.ln_q(x) + x = x.view(-1, self.hidden_size) + + mlp_fc1, mlp_act, mlp_fc2 = self.mlp + x_parallel, _ = mlp_fc1(x) + x_parallel = mlp_act(x_parallel) + out, _ = mlp_fc2(x_parallel) + return out + + +class Qwen2_5_VisionRotaryEmbedding(nn.Module): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.theta = theta + inv_freq = 1.0 / (theta + **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._freqs_cached = None + + def update_freqs_cache(self, seqlen: int) -> None: + if seqlen > self._seq_len_cached: + seqlen *= 2 + self._seq_len_cached = seqlen + self.inv_freq = 1.0 / (self.theta**(torch.arange( + 0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device) + / self.dim)) + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) + freqs = torch.outer(seq, self.inv_freq) + self._freqs_cached = freqs + + def forward(self, seqlen: int) -> torch.Tensor: + self.update_freqs_cache(seqlen) + return self._freqs_cached[:seqlen] + + +class Qwen2_5_VisionTransformer(nn.Module): + + def __init__( + self, + vision_config: Qwen2_5_VLVisionConfig, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + + patch_size: int = vision_config.patch_size + temporal_patch_size: int = vision_config.temporal_patch_size + spatial_merge_size: int = vision_config.spatial_merge_size + in_channels: int = vision_config.in_channels + hidden_size: int = vision_config.hidden_size + depth: int = vision_config.depth + num_heads: int = vision_config.num_heads + self.spatial_merge_size = spatial_merge_size + self.window_size = vision_config.window_size + self.patch_size = patch_size + self.spatial_merge_unit = self.spatial_merge_size**2 + self.fullatt_block_indexes = vision_config.fullatt_block_indexes + + self.patch_embed = Qwen2_5_VisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_chans=in_channels, + hidden_size=hidden_size, + ) + norm_layer = partial(RMSNorm, eps=norm_eps) + head_dim = hidden_size // num_heads + self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([ + Qwen2_5_VisionBlock( + dim=hidden_size, + num_heads=num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + norm_layer=norm_layer, + quant_config=quant_config, + ) for layer_idx in range(depth) + ]) + self.merger = Qwen2_5_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + ) + + @property + def dtype(self) -> torch.dtype: + return self.patch_embed.proj.weight.dtype + + @property + def device(self) -> torch.device: + return self.patch_embed.proj.weight.device + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def get_window_index(self, grid_thw): + window_index: list = [] + cu_window_seqlens: list = [0] + window_index_id = 0 + vit_merger_window_size = (self.window_size // + self.spatial_merge_size // self.patch_size) + + for grid_t, grid_h, grid_w in grid_thw: + llm_grid_h = grid_h // self.spatial_merge_size + llm_grid_w = grid_w // self.spatial_merge_size + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( + grid_t, llm_grid_h, llm_grid_w) + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) + index_padded = index_padded.reshape(grid_t, num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, num_windows_h * num_windows_w, vit_merger_window_size, + vit_merger_window_size) + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + window_index.append(index_new + window_index_id) + cu_seqlens_tmp = seqlens.cumsum( + 0) * self.spatial_merge_unit + cu_window_seqlens[-1] + cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) + window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() + window_index = torch.cat(window_index, dim=0) + return window_index, cu_window_seqlens + + def forward( + self, + x: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + # patchify + hidden_states = x.to(device=self.device, dtype=self.dtype) + hidden_states = self.patch_embed(hidden_states) + + # compute position embedding + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + # windows attention + window_index, cu_window_seqlens = self.get_window_index(grid_thw) + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device=hidden_states.device, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + seq_len, _ = hidden_states.size() + hidden_states = hidden_states.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + hidden_states = hidden_states[window_index, :, :] + hidden_states = hidden_states.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + rotary_pos_emb = rotary_pos_emb[window_index, :, :] + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + # compute cu_seqlens + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], + grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) + + # transformers + hidden_states = hidden_states.unsqueeze(1) + for layer_num, blk in enumerate(self.blocks): + if layer_num in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + else: + cu_seqlens_now = cu_window_seqlens + hidden_states = blk(hidden_states, + cu_seqlens=cu_seqlens_now, + rotary_pos_emb=rotary_pos_emb) + + # adapter + hidden_states = self.merger(hidden_states) + reverse_indices = torch.argsort(window_index) + hidden_states = hidden_states[reverse_indices, :] + return hidden_states + + +# === Vision input helpers === # + +cached_get_processor = lru_cache(get_processor) + + +def mm_input_mapper_for_qwen2_5_vl( + ctx: InputContext, + data: MultiModalData[object], + data_type_key: str, +) -> MultiModalInputs: + """Input mapper for Qwen2-VL.""" + if data_type_key == "image" and isinstance(data, dict): + return MultiModalInputs({ + "image_embeds": data.get("image_embeds"), + "image_grid_thw": data.get("image_grid_thw"), + }) + model_config = ctx.model_config + image_processor = cached_get_image_processor( + model_config.model, trust_remote_code=model_config.trust_remote_code) + if image_processor is None: + raise RuntimeError("No HuggingFace processor is available " + "to process the image object") + + images = None + videos = None + if data_type_key == "image": + images = data + else: + assert data_type_key == "video" + videos = data + + try: + batch_data = image_processor \ + .preprocess(images=images, videos=videos, return_tensors="pt") \ + .data + except Exception: + logger.error("Failed to process image (%s)", data) + raise + + return MultiModalInputs(batch_data) + + +image_input_mapper_for_qwen2_5_vl = partial(mm_input_mapper_for_qwen2_5_vl, + data_type_key="image") +video_input_mapper_for_qwen2_5_vl = partial(mm_input_mapper_for_qwen2_5_vl, + data_type_key="video") + + +def _get_vision_info( + image_processor, + height: int, + width: int, + min_pixels: int, + max_pixels: int, + do_resize: bool = True, + data_type_key: str = "image", + mm_count: int = 1, +): + """Get information (resized height / width and number of vision tokens) + of input image / video frame.""" + + if do_resize: + resized_height, resized_width = smart_resize( + height=height, + width=width, + factor=image_processor.patch_size * image_processor.merge_size, + min_pixels=min_pixels, + max_pixels=max_pixels, + ) + else: + resized_height, resized_width = height, width + + if data_type_key == "image": + grid_t = mm_count + else: + assert data_type_key == "video" + grid_t = max(mm_count // image_processor.temporal_patch_size, 1) + + grid_h = resized_height // image_processor.patch_size + grid_w = resized_width // image_processor.patch_size + vision_tokens = grid_t * grid_h * grid_w + llm_num_vision_tokens = (vision_tokens // image_processor.merge_size // + image_processor.merge_size) + + return resized_height, resized_width, llm_num_vision_tokens + + +def _get_max_image_info( + image_processor, + data_type_key: str = "image", + mm_count: int = 1, +): + return _get_vision_info( + image_processor, + height=9999999, + width=9999999, + + # Limit min / max pixels. + min_pixels=max(image_processor.min_pixels, 28 * 28), + max_pixels=min(image_processor.max_pixels, 1280 * 28 * 28), + data_type_key=data_type_key, + mm_count=mm_count, + ) + + +def get_max_qwen2_5_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int: + image_processor = cached_get_image_processor(ctx.model_config.model) + max_resized_height, max_resized_width, max_llm_image_tokens = \ + _get_max_image_info(image_processor, data_type_key=data_type_key, + mm_count=1) + return max_llm_image_tokens + + +get_max_qwen2_5_vl_image_tokens = partial(get_max_qwen2_5_vl_mm_tokens, + data_type_key="image") +get_max_qwen2_5_vl_video_tokens = partial(get_max_qwen2_5_vl_mm_tokens, + data_type_key="video") + + +def dummy_data_for_qwen2_5_vl( + ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int] +) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: + image_processor = cached_get_image_processor(ctx.model_config.model) + + num_images = mm_counts["image"] + max_resized_height, max_resized_width, max_llm_image_tokens = \ + _get_max_image_info(image_processor, data_type_key="image", + mm_count=num_images) + if seq_len - max_llm_image_tokens - 2 < 0: + raise RuntimeError( + f"Qwen2-VL cannot process {num_images} images in a prompt, " + "please increase max_model_len or reduce image limit by " + "--limit-mm-per-prompt.") + + # Check video counts. + num_videos = mm_counts["video"] + max_resized_height, max_resized_width, max_llm_video_tokens = \ + _get_max_image_info(image_processor, data_type_key="video", + mm_count=num_videos) + if seq_len - max_llm_video_tokens - 2 < 0: + raise RuntimeError( + f"Qwen2-VL cannot process {num_images} videos in a prompt, " + "please increase max_model_len or reduce video limit by " + "--limit-mm-per-prompt.") + + hf_config = ctx.get_hf_config(Qwen2_5_VLConfig) + + dummy_seqdata = SequenceData.from_token_counts( + (hf_config.vision_start_token_id, 1), + (hf_config.image_token_id, max_llm_image_tokens), + (hf_config.vision_end_token_id, 1), + (0, seq_len - max_llm_image_tokens - 2), + ) + + dummy_image = Image.new("RGB", (max_resized_width, max_resized_height), + color=0) + + return dummy_seqdata, { + "image": dummy_image if num_images == 1 else [dummy_image] * num_images + } + + +def _get_llm_num_vision_tokens( + mm_inputs: list, + data_type_key: str, + image_processor, +): + """Get number of vision tokens of multimodal inputs. + + This method is derived from `transformers.models.qwen2_vl. + image_processing_qwen2_vl.Qwen2_5_VLImageProcessor._preprocess`. + """ + image = to_numpy_array(mm_inputs[0]) + input_data_format = infer_channel_dimension_format(image) + height, width = get_image_size(image, channel_dim=input_data_format) + _, _, llm_num_vision_tokens = _get_vision_info( + image_processor, + height=height, + width=width, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + do_resize=image_processor.do_resize, + data_type_key=data_type_key, + mm_count=len(mm_inputs), + ) + return llm_num_vision_tokens + + +def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable, + data_type_key: str, image_processor: Any, + prompt_token_ids: List[int]) -> List[int]: + """ + Expand pad tokens for multi-modal inputs (e.g., images or videos). + + Args: + inputs (list): The multi-modal inputs (e.g., images or videos). + token_id (int): The token ID used to represent the multi-modal input. + make_batched_fn (Callable): A function to batch the inputs. + data_type_key (str): The type of the multi-modal input. + image_processor (Any): The image processor used to process the inputs. + prompt_token_ids (List[int]): The list of token IDs in the prompt. + + Returns: + List[int]: The list of token IDs for the multi-modal inputs. + """ + indices = [ + idx for idx, token in enumerate(prompt_token_ids) if token == token_id + ] + inputs = make_batched_fn(inputs) + assert len(indices) == len(inputs) + + prompt_token_ids_with_data = [] + for cnt, data in enumerate(inputs): + num_tokens = _get_llm_num_vision_tokens( + [data] if data_type_key == "image" else data, + data_type_key=data_type_key, + image_processor=image_processor, + ) + if cnt == 0: + end_idx = indices[cnt] + non_data_tokens = prompt_token_ids[:end_idx] + else: + non_data_tokens = prompt_token_ids[indices[cnt - 1] + + 1:indices[cnt]] + prompt_token_ids_with_data.extend(non_data_tokens) + prompt_token_ids_with_data.extend(token_id for _ in range(num_tokens)) + prompt_token_ids_with_data.extend(prompt_token_ids[indices[-1] + 1:]) + return prompt_token_ids_with_data + + +def input_processor_for_qwen2_5_vl(ctx: InputContext, + llm_inputs: LLMInputs) -> LLMInputs: + multi_modal_data = llm_inputs.get("multi_modal_data", None) + if multi_modal_data is None: + return llm_inputs + + image_inputs = multi_modal_data.get("image", None) + video_inputs = multi_modal_data.get("video", None) + + processor = cached_get_processor(ctx.model_config.model) + image_processor = processor.image_processor + hf_config = ctx.get_hf_config(Qwen2_5_VLConfig) + + # To avoid redundant processing of vision objects (resize, rescale, etc.), + # we extract code of calculating number of vision tokens from + # `transformers.models.qwen2_vl.processing_qwen2_vl.Qwen2_5_VLProcessor`. + # + # The following code is equivalent to: + # prompt = llm_inputs["prompt"] + # inputs = processor(text=[prompt], + # images=image_inputs, + # videos=video_inputs, + # padding=True, + # return_tensors="pt") + # prompt_token_ids = inputs["input_ids"][0].tolist() + + prompt_token_ids = llm_inputs.get("prompt_token_ids", None) + if prompt_token_ids is None: + prompt = llm_inputs["prompt"] + prompt_token_ids = processor.tokenizer( + prompt, + padding=True, + return_tensors=None, + )["input_ids"] + + # Expand image pad tokens. + + if image_inputs is not None: + if isinstance(image_inputs, dict): + prompt_token_ids_with_image = [] + image_indices = [ + idx for idx, token in enumerate(prompt_token_ids) + if token == hf_config.image_token_id + ] + image_cnt = len(image_indices) + embed_dim = image_inputs.get('image_embeds').size(0) + assert embed_dim % image_cnt == 0 + num_pad_tokens = embed_dim // image_cnt + for idx, token in enumerate(prompt_token_ids): + if idx in image_indices: + prompt_token_ids_with_image.extend([token] * + num_pad_tokens) + else: + prompt_token_ids_with_image.append(token) + prompt_token_ids = prompt_token_ids_with_image + else: + prompt_token_ids = _expand_pad_tokens(image_inputs, + hf_config.image_token_id, + make_batched_images, "image", + image_processor, + prompt_token_ids) + + if video_inputs is not None: + prompt_token_ids = _expand_pad_tokens(video_inputs, + hf_config.video_token_id, + make_batched_videos, "video", + image_processor, + prompt_token_ids) + + return LLMInputs( + prompt_token_ids=prompt_token_ids, + prompt=llm_inputs["prompt"], + multi_modal_data=multi_modal_data, + ) + + +@MULTIMODAL_REGISTRY.register_image_input_mapper( + image_input_mapper_for_qwen2_5_vl) +@MULTIMODAL_REGISTRY.register_input_mapper("video", + video_input_mapper_for_qwen2_5_vl) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_5_vl_image_tokens) +@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( + "video", get_max_qwen2_5_vl_video_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_5_vl) +@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_5_vl) +class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsPP): + + def __init__(self, + config: Qwen2_5_VLConfig, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__() + + assert not cache_config.enable_prefix_caching, \ + "Qwen2-VL currently does not support prefix caching" + + self.config = config + self.multimodal_config = multimodal_config + + self.visual = Qwen2_5_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + + # NOTE: Qwen2-5-VL vision encoder does not support any + # quantization method now. + quant_config=None, + ) + + self.model = Qwen2Model(config, cache_config, quant_config) + + if get_pp_group().is_last_rank: + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + else: + self.lm_head = PPMissingLayer() + + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = Sampler() + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + def _validate_and_reshape_mm_tensor(self, + mm_input: Union[torch.Tensor, + List[torch.Tensor]], + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. " + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == 2: + return mm_input + if mm_input.ndim != 3: + raise ValueError(f"{name} should be 2D or batched 3D tensor. " + f"Got ndim: {mm_input.ndim}") + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Qwen2_5_VLImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") + + return Qwen2_5_VLImagePixelInputs(type="pixel_values", + data=pixel_values, + image_grid_thw=image_grid_thw) + + if image_embeds is not None: + image_embeds = self._validate_and_reshape_mm_tensor( + image_embeds, "image embeds") + + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + return Qwen2_5_VLImageEmbeddingInputs(type="image_embeds", + data=image_embeds) + + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[Qwen2_5_VLVideoInputs]: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_grid_thw = kwargs.pop("video_grid_thw", None) + + if pixel_values_videos is None: + return None + + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Qwen2_5_VLVideoInputs( + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + ) + + def _process_image_input(self, + image_input: Qwen2_5_VLImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"].type(self.visual.dtype) + + pixel_values = image_input["data"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, + grid_thw=image_input["image_grid_thw"]) + return image_embeds + + def _process_video_input(self, + video_input: Qwen2_5_VLVideoInputs) -> torch.Tensor: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + video_embeds = self.visual(pixel_values_videos, + grid_thw=video_input["video_grid_thw"]) + return video_embeds + + def _merge_multimodal_embeddings( + self, + input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + multimodal_embeddings: torch.Tensor, + placeholder_token_id: int, + ) -> torch.Tensor: + mask = (input_ids == placeholder_token_id) + inputs_embeds[mask, :] = multimodal_embeddings + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + """Run forward pass for Qwen2-VL. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + positions: Flattened (concatenated) position ids corresponding to a + batch. + **NOTE**: If mrope is enabled (default setting for Qwen2-VL + opensource models), the shape will be `(3, seq_len)`, + otherwise it will be `(seq_len,). + pixel_values: Pixel values to be fed to a model. + `None` if no images are passed. + image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. + `None` if no images are passed. + pixel_values_videos: Pixel values of videos to be fed to a model. + `None` if no videos are passed. + video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. + `None` if no videos are passed. + """ + if intermediate_tensors is not None: + input_ids = None + inputs_embeds = None + else: + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + rope_scaling = getattr(self.config, "rope_scaling", {}) + if rope_scaling.get("type", None) == "mrope": + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + + inputs_embeds = self.model.embed_tokens(input_ids) + + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = self._merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_id, + ) + + if video_input is not None: + video_embeds = self._process_video_input(video_input) + inputs_embeds = self._merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_id, + ) + + input_ids = None + + hidden_states = self.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "up_proj", 1), + ("gate_up_proj", "gate_proj", 0), + ] + + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if self.config.tie_word_embeddings and "lm_head.weight" in name: + continue + is_visual_gate_up = "visual" in name and ("gate_proj" in name or "up_proj" in name) + if is_visual_gate_up: + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + try: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + except KeyError: + raise ValueError(f"Unexpected weight: {name}") from None + + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 87a08b2..3f63c5f 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -136,11 +136,11 @@ class Qwen3Attention(nn.Module): # Add qk-norm q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim) - q_by_head = self.q_norm.forward_native(q_by_head) + q_by_head = self.q_norm.forward_cuda(q_by_head.contiguous()) q = q_by_head.view(q.shape) k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim) - k_by_head = self.k_norm.forward_native(k_by_head) + k_by_head = self.k_norm.forward_cuda(k_by_head.contiguous()) k = k_by_head.view(k.shape) q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 73ae906..a2d063c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -114,6 +114,7 @@ _MULTIMODAL_MODELS = { "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501 "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 + "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), # [Encoder-decoder] "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 diff --git a/vllm/multimodal/__pycache__/__init__.cpython-310.pyc b/vllm/multimodal/__pycache__/__init__.cpython-310.pyc index 6add128..ab4b707 100644 Binary files a/vllm/multimodal/__pycache__/__init__.cpython-310.pyc and b/vllm/multimodal/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/multimodal/__pycache__/audio.cpython-310.pyc b/vllm/multimodal/__pycache__/audio.cpython-310.pyc index 8b05e28..a1bbd41 100644 Binary files a/vllm/multimodal/__pycache__/audio.cpython-310.pyc and b/vllm/multimodal/__pycache__/audio.cpython-310.pyc differ diff --git a/vllm/multimodal/__pycache__/base.cpython-310.pyc b/vllm/multimodal/__pycache__/base.cpython-310.pyc index db52c36..b2ae847 100644 Binary files a/vllm/multimodal/__pycache__/base.cpython-310.pyc and b/vllm/multimodal/__pycache__/base.cpython-310.pyc differ diff --git a/vllm/multimodal/__pycache__/image.cpython-310.pyc b/vllm/multimodal/__pycache__/image.cpython-310.pyc index 0790982..d5898d2 100644 Binary files a/vllm/multimodal/__pycache__/image.cpython-310.pyc and b/vllm/multimodal/__pycache__/image.cpython-310.pyc differ diff --git a/vllm/multimodal/__pycache__/registry.cpython-310.pyc b/vllm/multimodal/__pycache__/registry.cpython-310.pyc index 98fe724..350ec50 100644 Binary files a/vllm/multimodal/__pycache__/registry.cpython-310.pyc and b/vllm/multimodal/__pycache__/registry.cpython-310.pyc differ diff --git a/vllm/multimodal/__pycache__/utils.cpython-310.pyc b/vllm/multimodal/__pycache__/utils.cpython-310.pyc index 9e3b55c..d04d3df 100644 Binary files a/vllm/multimodal/__pycache__/utils.cpython-310.pyc and b/vllm/multimodal/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/multimodal/__pycache__/video.cpython-310.pyc b/vllm/multimodal/__pycache__/video.cpython-310.pyc index 6a318a2..e2d3163 100644 Binary files a/vllm/multimodal/__pycache__/video.cpython-310.pyc and b/vllm/multimodal/__pycache__/video.cpython-310.pyc differ diff --git a/vllm/platforms/__pycache__/__init__.cpython-310.pyc b/vllm/platforms/__pycache__/__init__.cpython-310.pyc index 07fb96c..0b50dc4 100644 Binary files a/vllm/platforms/__pycache__/__init__.cpython-310.pyc and b/vllm/platforms/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/platforms/__pycache__/cpu.cpython-310.pyc b/vllm/platforms/__pycache__/cpu.cpython-310.pyc index 0dcd79b..49e4ed2 100644 Binary files a/vllm/platforms/__pycache__/cpu.cpython-310.pyc and b/vllm/platforms/__pycache__/cpu.cpython-310.pyc differ diff --git a/vllm/platforms/__pycache__/cuda.cpython-310.pyc b/vllm/platforms/__pycache__/cuda.cpython-310.pyc index 559a358..6c4935a 100644 Binary files a/vllm/platforms/__pycache__/cuda.cpython-310.pyc and b/vllm/platforms/__pycache__/cuda.cpython-310.pyc differ diff --git a/vllm/platforms/__pycache__/interface.cpython-310.pyc b/vllm/platforms/__pycache__/interface.cpython-310.pyc index 9e971de..053ec4f 100644 Binary files a/vllm/platforms/__pycache__/interface.cpython-310.pyc and b/vllm/platforms/__pycache__/interface.cpython-310.pyc differ diff --git a/vllm/platforms/__pycache__/rocm.cpython-310.pyc b/vllm/platforms/__pycache__/rocm.cpython-310.pyc index 33cda23..b1aab34 100644 Binary files a/vllm/platforms/__pycache__/rocm.cpython-310.pyc and b/vllm/platforms/__pycache__/rocm.cpython-310.pyc differ diff --git a/vllm/platforms/__pycache__/tpu.cpython-310.pyc b/vllm/platforms/__pycache__/tpu.cpython-310.pyc index 84d31fe..142ab4d 100644 Binary files a/vllm/platforms/__pycache__/tpu.cpython-310.pyc and b/vllm/platforms/__pycache__/tpu.cpython-310.pyc differ diff --git a/vllm/platforms/__pycache__/xpu.cpython-310.pyc b/vllm/platforms/__pycache__/xpu.cpython-310.pyc index 87134d7..b5ebb41 100644 Binary files a/vllm/platforms/__pycache__/xpu.cpython-310.pyc and b/vllm/platforms/__pycache__/xpu.cpython-310.pyc differ diff --git a/vllm/plugins/__pycache__/__init__.cpython-310.pyc b/vllm/plugins/__pycache__/__init__.cpython-310.pyc index fb5aa1d..01e729b 100644 Binary files a/vllm/plugins/__pycache__/__init__.cpython-310.pyc and b/vllm/plugins/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/prompt_adapter/__pycache__/__init__.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/__init__.cpython-310.pyc index b77191e..a0d7d61 100644 Binary files a/vllm/prompt_adapter/__pycache__/__init__.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/prompt_adapter/__pycache__/layers.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/layers.cpython-310.pyc index abe0d5f..8291d18 100644 Binary files a/vllm/prompt_adapter/__pycache__/layers.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/layers.cpython-310.pyc differ diff --git a/vllm/prompt_adapter/__pycache__/models.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/models.cpython-310.pyc index fb21ff6..bfb83a6 100644 Binary files a/vllm/prompt_adapter/__pycache__/models.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/models.cpython-310.pyc differ diff --git a/vllm/prompt_adapter/__pycache__/request.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/request.cpython-310.pyc index 4cbd187..29f2651 100644 Binary files a/vllm/prompt_adapter/__pycache__/request.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/request.cpython-310.pyc differ diff --git a/vllm/prompt_adapter/__pycache__/utils.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/utils.cpython-310.pyc index a143fd4..0e78e38 100644 Binary files a/vllm/prompt_adapter/__pycache__/utils.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/prompt_adapter/__pycache__/worker_manager.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/worker_manager.cpython-310.pyc index 3cc1bba..24b2c33 100644 Binary files a/vllm/prompt_adapter/__pycache__/worker_manager.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/worker_manager.cpython-310.pyc differ diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py index 4cde2a0..aae81de 100644 --- a/vllm/prompt_adapter/utils.py +++ b/vllm/prompt_adapter/utils.py @@ -90,6 +90,6 @@ def load_peft_weights(model_id: str, adapters_weights = safe_load_file(filename, device=device) else: adapters_weights = torch.load(filename, - map_location=torch.device(device)) + map_location=torch.device(device), weights_only=True) return adapters_weights diff --git a/vllm/spec_decode/__pycache__/__init__.cpython-310.pyc b/vllm/spec_decode/__pycache__/__init__.cpython-310.pyc index 213d9b8..bccb28a 100644 Binary files a/vllm/spec_decode/__pycache__/__init__.cpython-310.pyc and b/vllm/spec_decode/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/batch_expansion.cpython-310.pyc b/vllm/spec_decode/__pycache__/batch_expansion.cpython-310.pyc index c07646f..6c3fbc2 100644 Binary files a/vllm/spec_decode/__pycache__/batch_expansion.cpython-310.pyc and b/vllm/spec_decode/__pycache__/batch_expansion.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/draft_model_runner.cpython-310.pyc b/vllm/spec_decode/__pycache__/draft_model_runner.cpython-310.pyc index a298a18..50e0db8 100644 Binary files a/vllm/spec_decode/__pycache__/draft_model_runner.cpython-310.pyc and b/vllm/spec_decode/__pycache__/draft_model_runner.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/interfaces.cpython-310.pyc b/vllm/spec_decode/__pycache__/interfaces.cpython-310.pyc index ed50914..9a5a065 100644 Binary files a/vllm/spec_decode/__pycache__/interfaces.cpython-310.pyc and b/vllm/spec_decode/__pycache__/interfaces.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/medusa_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/medusa_worker.cpython-310.pyc index d887416..325c060 100644 Binary files a/vllm/spec_decode/__pycache__/medusa_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/medusa_worker.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/metrics.cpython-310.pyc b/vllm/spec_decode/__pycache__/metrics.cpython-310.pyc index f37fc63..9518ee3 100644 Binary files a/vllm/spec_decode/__pycache__/metrics.cpython-310.pyc and b/vllm/spec_decode/__pycache__/metrics.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/mlp_speculator_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/mlp_speculator_worker.cpython-310.pyc index da7501c..5e4f97d 100644 Binary files a/vllm/spec_decode/__pycache__/mlp_speculator_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/mlp_speculator_worker.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/mqa_scorer.cpython-310.pyc b/vllm/spec_decode/__pycache__/mqa_scorer.cpython-310.pyc index 344f6a5..3a689b4 100644 Binary files a/vllm/spec_decode/__pycache__/mqa_scorer.cpython-310.pyc and b/vllm/spec_decode/__pycache__/mqa_scorer.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/multi_step_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/multi_step_worker.cpython-310.pyc index 3eae3f9..7ad4533 100644 Binary files a/vllm/spec_decode/__pycache__/multi_step_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/multi_step_worker.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/ngram_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/ngram_worker.cpython-310.pyc index 0da7d4e..dfc4758 100644 Binary files a/vllm/spec_decode/__pycache__/ngram_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/ngram_worker.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/proposer_worker_base.cpython-310.pyc b/vllm/spec_decode/__pycache__/proposer_worker_base.cpython-310.pyc index 2b3eb51..6d326eb 100644 Binary files a/vllm/spec_decode/__pycache__/proposer_worker_base.cpython-310.pyc and b/vllm/spec_decode/__pycache__/proposer_worker_base.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/smaller_tp_proposer_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/smaller_tp_proposer_worker.cpython-310.pyc index c47e835..c8bd07f 100644 Binary files a/vllm/spec_decode/__pycache__/smaller_tp_proposer_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/smaller_tp_proposer_worker.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/spec_decode_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/spec_decode_worker.cpython-310.pyc index a4e914a..dae197d 100644 Binary files a/vllm/spec_decode/__pycache__/spec_decode_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/spec_decode_worker.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/target_model_runner.cpython-310.pyc b/vllm/spec_decode/__pycache__/target_model_runner.cpython-310.pyc index af40176..dcefa2f 100644 Binary files a/vllm/spec_decode/__pycache__/target_model_runner.cpython-310.pyc and b/vllm/spec_decode/__pycache__/target_model_runner.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/top1_proposer.cpython-310.pyc b/vllm/spec_decode/__pycache__/top1_proposer.cpython-310.pyc index 0e72f46..d1f72a3 100644 Binary files a/vllm/spec_decode/__pycache__/top1_proposer.cpython-310.pyc and b/vllm/spec_decode/__pycache__/top1_proposer.cpython-310.pyc differ diff --git a/vllm/spec_decode/__pycache__/util.cpython-310.pyc b/vllm/spec_decode/__pycache__/util.cpython-310.pyc index 2f06c7f..1a4ce6b 100644 Binary files a/vllm/spec_decode/__pycache__/util.cpython-310.pyc and b/vllm/spec_decode/__pycache__/util.cpython-310.pyc differ diff --git a/vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc b/vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc index 8b699dd..628a03d 100644 Binary files a/vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/transformers_utils/__pycache__/config.cpython-310.pyc b/vllm/transformers_utils/__pycache__/config.cpython-310.pyc index 852d8f1..532911c 100644 Binary files a/vllm/transformers_utils/__pycache__/config.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/config.cpython-310.pyc differ diff --git a/vllm/transformers_utils/__pycache__/detokenizer.cpython-310.pyc b/vllm/transformers_utils/__pycache__/detokenizer.cpython-310.pyc index cb27203..6812464 100644 Binary files a/vllm/transformers_utils/__pycache__/detokenizer.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/detokenizer.cpython-310.pyc differ diff --git a/vllm/transformers_utils/__pycache__/processor.cpython-310.pyc b/vllm/transformers_utils/__pycache__/processor.cpython-310.pyc index a535b54..92197be 100644 Binary files a/vllm/transformers_utils/__pycache__/processor.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/processor.cpython-310.pyc differ diff --git a/vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc b/vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc index 60b4473..191d284 100644 Binary files a/vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc differ diff --git a/vllm/transformers_utils/__pycache__/utils.cpython-310.pyc b/vllm/transformers_utils/__pycache__/utils.cpython-310.pyc index 77fb60c..4dc959c 100644 Binary files a/vllm/transformers_utils/__pycache__/utils.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/__init__.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/__init__.cpython-310.pyc index 400b5e3..52a9ae4 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/__init__.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/arctic.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/arctic.cpython-310.pyc index 84f5429..492608d 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/arctic.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/arctic.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/chatglm.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/chatglm.cpython-310.pyc index 5b934d3..b8baa1b 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/chatglm.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/chatglm.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/dbrx.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/dbrx.cpython-310.pyc index 2f8b67a..f014576 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/dbrx.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/dbrx.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/eagle.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/eagle.cpython-310.pyc index 39a4a50..be8b08a 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/eagle.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/eagle.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/exaone.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/exaone.cpython-310.pyc index 161adbc..c39474d 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/exaone.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/exaone.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/falcon.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/falcon.cpython-310.pyc index aab95b5..49f1625 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/falcon.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/falcon.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/internvl.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/internvl.cpython-310.pyc index 02d777a..8477890 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/internvl.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/internvl.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc index 851452d..777501a 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/medusa.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/medusa.cpython-310.pyc index dd42292..ad40fe5 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/medusa.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/medusa.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/mllama.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/mllama.cpython-310.pyc index 6fcf925..e247025 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/mllama.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/mllama.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/mlp_speculator.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/mlp_speculator.cpython-310.pyc index a807a6d..8ce7a6d 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/mlp_speculator.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/mlp_speculator.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc index 18a7758..07ea934 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/nemotron.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/nemotron.cpython-310.pyc index d8b8e04..693b3ca 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/nemotron.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/nemotron.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/nvlm_d.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/nvlm_d.cpython-310.pyc index 3e46783..f9cee42 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/nvlm_d.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/nvlm_d.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/qwen2vl.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/qwen2vl.cpython-310.pyc index 6d53853..32491a1 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/qwen2vl.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/qwen2vl.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/solar.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/solar.cpython-310.pyc index 72f652c..17a155c 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/solar.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/solar.cpython-310.pyc differ diff --git a/vllm/transformers_utils/configs/__pycache__/ultravox.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/ultravox.cpython-310.pyc index 2385a07..103564d 100644 Binary files a/vllm/transformers_utils/configs/__pycache__/ultravox.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/ultravox.cpython-310.pyc differ diff --git a/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-310.pyc b/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-310.pyc index 65433db..f7ff6e4 100644 Binary files a/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-310.pyc and b/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-310.pyc b/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-310.pyc index 63750e8..e107728 100644 Binary files a/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-310.pyc and b/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-310.pyc differ diff --git a/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-310.pyc b/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-310.pyc index 43c7d5f..5c3bbcc 100644 Binary files a/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-310.pyc and b/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-310.pyc differ diff --git a/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-310.pyc b/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-310.pyc index 4554869..0d35ced 100644 Binary files a/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-310.pyc and b/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-310.pyc differ diff --git a/vllm/transformers_utils/tokenizers/__pycache__/__init__.cpython-310.pyc b/vllm/transformers_utils/tokenizers/__pycache__/__init__.cpython-310.pyc index e4bba79..1e6b3a2 100644 Binary files a/vllm/transformers_utils/tokenizers/__pycache__/__init__.cpython-310.pyc and b/vllm/transformers_utils/tokenizers/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/transformers_utils/tokenizers/__pycache__/mistral.cpython-310.pyc b/vllm/transformers_utils/tokenizers/__pycache__/mistral.cpython-310.pyc index 27bceff..c3399cc 100644 Binary files a/vllm/transformers_utils/tokenizers/__pycache__/mistral.cpython-310.pyc and b/vllm/transformers_utils/tokenizers/__pycache__/mistral.cpython-310.pyc differ diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index aae10d3..b7e33ae 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -1,5 +1,5 @@ import os -import re +import regex as re from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py index 4e19581..80f7a32 100644 --- a/vllm/triton_utils/__init__.py +++ b/vllm/triton_utils/__init__.py @@ -2,8 +2,8 @@ from vllm.triton_utils.importing import HAS_TRITON __all__ = ["HAS_TRITON"] -#from vllm.triton_utils.custom_cache_manager import ( -# maybe_set_triton_cache_manager) -#from vllm.triton_utils.libentry import libentry +from vllm.triton_utils.custom_cache_manager import ( + maybe_set_triton_cache_manager) +from vllm.triton_utils.libentry import libentry __all__ += ["maybe_set_triton_cache_manager", "libentry"] diff --git a/vllm/triton_utils/__pycache__/__init__.cpython-310.pyc b/vllm/triton_utils/__pycache__/__init__.cpython-310.pyc index 9be72bc..1ddb711 100644 Binary files a/vllm/triton_utils/__pycache__/__init__.cpython-310.pyc and b/vllm/triton_utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/triton_utils/__pycache__/custom_cache_manager.cpython-310.pyc b/vllm/triton_utils/__pycache__/custom_cache_manager.cpython-310.pyc index 63d5cd3..6359ac7 100644 Binary files a/vllm/triton_utils/__pycache__/custom_cache_manager.cpython-310.pyc and b/vllm/triton_utils/__pycache__/custom_cache_manager.cpython-310.pyc differ diff --git a/vllm/triton_utils/__pycache__/importing.cpython-310.pyc b/vllm/triton_utils/__pycache__/importing.cpython-310.pyc index 81c44b1..1a18938 100644 Binary files a/vllm/triton_utils/__pycache__/importing.cpython-310.pyc and b/vllm/triton_utils/__pycache__/importing.cpython-310.pyc differ diff --git a/vllm/triton_utils/__pycache__/libentry.cpython-310.pyc b/vllm/triton_utils/__pycache__/libentry.cpython-310.pyc index 5e74d2b..2aa05a1 100644 Binary files a/vllm/triton_utils/__pycache__/libentry.cpython-310.pyc and b/vllm/triton_utils/__pycache__/libentry.cpython-310.pyc differ diff --git a/vllm/usage/__pycache__/__init__.cpython-310.pyc b/vllm/usage/__pycache__/__init__.cpython-310.pyc index 9320f07..2ebb663 100644 Binary files a/vllm/usage/__pycache__/__init__.cpython-310.pyc and b/vllm/usage/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/usage/__pycache__/usage_lib.cpython-310.pyc b/vllm/usage/__pycache__/usage_lib.cpython-310.pyc index 4c0bd29..b7990bf 100644 Binary files a/vllm/usage/__pycache__/usage_lib.cpython-310.pyc and b/vllm/usage/__pycache__/usage_lib.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/__init__.cpython-310.pyc b/vllm/worker/__pycache__/__init__.cpython-310.pyc index 9f2330d..625607b 100644 Binary files a/vllm/worker/__pycache__/__init__.cpython-310.pyc and b/vllm/worker/__pycache__/__init__.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/cache_engine.cpython-310.pyc b/vllm/worker/__pycache__/cache_engine.cpython-310.pyc index ba2d296..3facbcf 100644 Binary files a/vllm/worker/__pycache__/cache_engine.cpython-310.pyc and b/vllm/worker/__pycache__/cache_engine.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/cpu_enc_dec_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/cpu_enc_dec_model_runner.cpython-310.pyc index 29b091e..b6a4074 100644 Binary files a/vllm/worker/__pycache__/cpu_enc_dec_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/cpu_enc_dec_model_runner.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/cpu_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/cpu_model_runner.cpython-310.pyc index 44e5a90..b897138 100644 Binary files a/vllm/worker/__pycache__/cpu_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/cpu_model_runner.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/cpu_worker.cpython-310.pyc b/vllm/worker/__pycache__/cpu_worker.cpython-310.pyc index c85339b..65c86b6 100644 Binary files a/vllm/worker/__pycache__/cpu_worker.cpython-310.pyc and b/vllm/worker/__pycache__/cpu_worker.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/embedding_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/embedding_model_runner.cpython-310.pyc index 3803bc6..336c63b 100644 Binary files a/vllm/worker/__pycache__/embedding_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/embedding_model_runner.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/enc_dec_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/enc_dec_model_runner.cpython-310.pyc index b125020..3f44f9d 100644 Binary files a/vllm/worker/__pycache__/enc_dec_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/enc_dec_model_runner.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/model_runner.cpython-310.pyc b/vllm/worker/__pycache__/model_runner.cpython-310.pyc index a8cef01..b85fb28 100644 Binary files a/vllm/worker/__pycache__/model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/model_runner.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/model_runner_base.cpython-310.pyc b/vllm/worker/__pycache__/model_runner_base.cpython-310.pyc index 3706030..41fe8a4 100644 Binary files a/vllm/worker/__pycache__/model_runner_base.cpython-310.pyc and b/vllm/worker/__pycache__/model_runner_base.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/multi_step_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/multi_step_model_runner.cpython-310.pyc index 6a57eb8..b05eb79 100644 Binary files a/vllm/worker/__pycache__/multi_step_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/multi_step_model_runner.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/multi_step_tpu_worker.cpython-310.pyc b/vllm/worker/__pycache__/multi_step_tpu_worker.cpython-310.pyc index 1ef4b45..fabe7d2 100644 Binary files a/vllm/worker/__pycache__/multi_step_tpu_worker.cpython-310.pyc and b/vllm/worker/__pycache__/multi_step_tpu_worker.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/multi_step_worker.cpython-310.pyc b/vllm/worker/__pycache__/multi_step_worker.cpython-310.pyc index c55ecd0..6144413 100644 Binary files a/vllm/worker/__pycache__/multi_step_worker.cpython-310.pyc and b/vllm/worker/__pycache__/multi_step_worker.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/neuron_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/neuron_model_runner.cpython-310.pyc index 3740c52..f69ffee 100644 Binary files a/vllm/worker/__pycache__/neuron_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/neuron_model_runner.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/neuron_worker.cpython-310.pyc b/vllm/worker/__pycache__/neuron_worker.cpython-310.pyc index 5fac9ad..9a00b6b 100644 Binary files a/vllm/worker/__pycache__/neuron_worker.cpython-310.pyc and b/vllm/worker/__pycache__/neuron_worker.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/openvino_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/openvino_model_runner.cpython-310.pyc index 6237b1c..335c0e0 100644 Binary files a/vllm/worker/__pycache__/openvino_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/openvino_model_runner.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/openvino_worker.cpython-310.pyc b/vllm/worker/__pycache__/openvino_worker.cpython-310.pyc index f6fa42e..87ac044 100644 Binary files a/vllm/worker/__pycache__/openvino_worker.cpython-310.pyc and b/vllm/worker/__pycache__/openvino_worker.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/tpu_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/tpu_model_runner.cpython-310.pyc index 3a70092..6ee4b27 100644 Binary files a/vllm/worker/__pycache__/tpu_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/tpu_model_runner.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/tpu_worker.cpython-310.pyc b/vllm/worker/__pycache__/tpu_worker.cpython-310.pyc index e76dea5..0b963f6 100644 Binary files a/vllm/worker/__pycache__/tpu_worker.cpython-310.pyc and b/vllm/worker/__pycache__/tpu_worker.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/utils.cpython-310.pyc b/vllm/worker/__pycache__/utils.cpython-310.pyc index a32a86a..6e96466 100644 Binary files a/vllm/worker/__pycache__/utils.cpython-310.pyc and b/vllm/worker/__pycache__/utils.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/worker.cpython-310.pyc b/vllm/worker/__pycache__/worker.cpython-310.pyc index 2ea3809..b65ca57 100644 Binary files a/vllm/worker/__pycache__/worker.cpython-310.pyc and b/vllm/worker/__pycache__/worker.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/worker_base.cpython-310.pyc b/vllm/worker/__pycache__/worker_base.cpython-310.pyc index b3bdfef..a1aec6a 100644 Binary files a/vllm/worker/__pycache__/worker_base.cpython-310.pyc and b/vllm/worker/__pycache__/worker_base.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/xpu_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/xpu_model_runner.cpython-310.pyc index 8130605..ee9897b 100644 Binary files a/vllm/worker/__pycache__/xpu_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/xpu_model_runner.cpython-310.pyc differ diff --git a/vllm/worker/__pycache__/xpu_worker.cpython-310.pyc b/vllm/worker/__pycache__/xpu_worker.cpython-310.pyc index 7b6f418..c4cc9e5 100644 Binary files a/vllm/worker/__pycache__/xpu_worker.cpython-310.pyc and b/vllm/worker/__pycache__/xpu_worker.cpython-310.pyc differ