diff --git a/Dockerfile b/Dockerfile
index 62b7be9..51d3df7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,19 +1,15 @@
-FROM git.modelhub.org.cn:9443/enginex-iluvatar/bi100-3.2.1-x86-ubuntu20.04-py3.10-poc-llm-infer:20250731115755
+FROM git.modelhub.org.cn:9443/enginex-iluvatar/bi100-3.2.3-x86-ubuntu20.04-py3.10-poc-llm-infer:v1.2.3
 
 RUN pip install --no-cache-dir triton==2.1.0
 
 COPY pkgs/triton /usr/local/corex/lib64/python3/dist-packages/triton
 COPY pkgs/triton-2.1.0+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/triton-2.1.0+corex.4.1.2.dist-info
-COPY pkgs/xformers-0.0.22+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/xformers-0.0.22+corex.4.1.2.dist-info
-COPY pkgs/xformers /usr/local/corex/lib64/python3/dist-packages/xformers
 
-COPY paged_attn.py /usr/local/lib/python3.10/site-packages/vllm/attention/ops/paged_attn.py
-COPY __init__.py /usr/local/lib/python3.10/site-packages/vllm/triton_utils/__init__.py
-COPY prefix_prefill.py /usr/local/lib/python3.10/site-packages/vllm/attention/ops/prefix_prefill.py
+COPY paged_attn.py /usr/local/corex/lib64/python3/dist-packages/vllm/attention/ops/paged_attn.py
+COPY __init__.py /usr/local/corex/lib64/python3/dist-packages/vllm/triton_utils/__init__.py
 
 RUN mkdir /workspace
 WORKDIR /workspace/
 
 COPY ./launch_service /workspace/launch_service
 
-ENTRYPOINT ["./launch_service"]
diff --git a/paged_attn.py b/paged_attn.py
index 1741dd1..988f903 100644
--- a/paged_attn.py
+++ b/paged_attn.py
@@ -4,6 +4,7 @@ from typing import List, Optional, Tuple
 import torch
 
 from vllm import _custom_ops as ops
+
 from vllm.attention.ops.prefix_prefill import context_attention_fwd
 
 # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
diff --git a/vllm/__pycache__/__init__.cpython-310.pyc b/vllm/__pycache__/__init__.cpython-310.pyc
index 1190aa9..ba64550 100644
Binary files a/vllm/__pycache__/__init__.cpython-310.pyc and b/vllm/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/__pycache__/_core_ext.cpython-310.pyc b/vllm/__pycache__/_core_ext.cpython-310.pyc
index aaa4f73..3c7a3da 100644
Binary files a/vllm/__pycache__/_core_ext.cpython-310.pyc and b/vllm/__pycache__/_core_ext.cpython-310.pyc differ
diff --git a/vllm/__pycache__/_custom_ops.cpython-310.pyc b/vllm/__pycache__/_custom_ops.cpython-310.pyc
index 4f0ece7..fc17fb1 100644
Binary files a/vllm/__pycache__/_custom_ops.cpython-310.pyc and b/vllm/__pycache__/_custom_ops.cpython-310.pyc differ
diff --git a/vllm/__pycache__/_ipex_ops.cpython-310.pyc b/vllm/__pycache__/_ipex_ops.cpython-310.pyc
index 69c6c0c..afea7b9 100644
Binary files a/vllm/__pycache__/_ipex_ops.cpython-310.pyc and b/vllm/__pycache__/_ipex_ops.cpython-310.pyc differ
diff --git a/vllm/__pycache__/beam_search.cpython-310.pyc b/vllm/__pycache__/beam_search.cpython-310.pyc
index cad0282..4e88877 100644
Binary files a/vllm/__pycache__/beam_search.cpython-310.pyc and b/vllm/__pycache__/beam_search.cpython-310.pyc differ
diff --git a/vllm/__pycache__/block.cpython-310.pyc b/vllm/__pycache__/block.cpython-310.pyc
index c27aa2b..2ec9664 100644
Binary files a/vllm/__pycache__/block.cpython-310.pyc and b/vllm/__pycache__/block.cpython-310.pyc differ
diff --git a/vllm/__pycache__/config.cpython-310.pyc b/vllm/__pycache__/config.cpython-310.pyc
index e54d5ee..7f93840 100644
Binary files a/vllm/__pycache__/config.cpython-310.pyc and b/vllm/__pycache__/config.cpython-310.pyc differ
diff --git a/vllm/__pycache__/connections.cpython-310.pyc b/vllm/__pycache__/connections.cpython-310.pyc
index d662859..977dbba 100644
Binary files a/vllm/__pycache__/connections.cpython-310.pyc and b/vllm/__pycache__/connections.cpython-310.pyc differ
diff --git a/vllm/__pycache__/envs.cpython-310.pyc b/vllm/__pycache__/envs.cpython-310.pyc
index bde38c9..223d6c0 100644
Binary files a/vllm/__pycache__/envs.cpython-310.pyc and b/vllm/__pycache__/envs.cpython-310.pyc differ
diff --git a/vllm/__pycache__/forward_context.cpython-310.pyc b/vllm/__pycache__/forward_context.cpython-310.pyc
index 539b909..135a5b3 100644
Binary files a/vllm/__pycache__/forward_context.cpython-310.pyc and b/vllm/__pycache__/forward_context.cpython-310.pyc differ
diff --git a/vllm/__pycache__/logger.cpython-310.pyc b/vllm/__pycache__/logger.cpython-310.pyc
index 15c9df2..5009ecf 100644
Binary files a/vllm/__pycache__/logger.cpython-310.pyc and b/vllm/__pycache__/logger.cpython-310.pyc differ
diff --git a/vllm/__pycache__/outputs.cpython-310.pyc b/vllm/__pycache__/outputs.cpython-310.pyc
index 448d4fe..932e79b 100644
Binary files a/vllm/__pycache__/outputs.cpython-310.pyc and b/vllm/__pycache__/outputs.cpython-310.pyc differ
diff --git a/vllm/__pycache__/pooling_params.cpython-310.pyc b/vllm/__pycache__/pooling_params.cpython-310.pyc
index 05d2402..da6c1ba 100644
Binary files a/vllm/__pycache__/pooling_params.cpython-310.pyc and b/vllm/__pycache__/pooling_params.cpython-310.pyc differ
diff --git a/vllm/__pycache__/sampling_params.cpython-310.pyc b/vllm/__pycache__/sampling_params.cpython-310.pyc
index 31c99bc..371575b 100644
Binary files a/vllm/__pycache__/sampling_params.cpython-310.pyc and b/vllm/__pycache__/sampling_params.cpython-310.pyc differ
diff --git a/vllm/__pycache__/scalar_type.cpython-310.pyc b/vllm/__pycache__/scalar_type.cpython-310.pyc
index 8aa5fca..5f2f2da 100644
Binary files a/vllm/__pycache__/scalar_type.cpython-310.pyc and b/vllm/__pycache__/scalar_type.cpython-310.pyc differ
diff --git a/vllm/__pycache__/scripts.cpython-310.pyc b/vllm/__pycache__/scripts.cpython-310.pyc
index 45eb857..3d8abe2 100644
Binary files a/vllm/__pycache__/scripts.cpython-310.pyc and b/vllm/__pycache__/scripts.cpython-310.pyc differ
diff --git a/vllm/__pycache__/sequence.cpython-310.pyc b/vllm/__pycache__/sequence.cpython-310.pyc
index ca6591b..de03038 100644
Binary files a/vllm/__pycache__/sequence.cpython-310.pyc and b/vllm/__pycache__/sequence.cpython-310.pyc differ
diff --git a/vllm/__pycache__/tracing.cpython-310.pyc b/vllm/__pycache__/tracing.cpython-310.pyc
index b4dd2a9..0914d51 100644
Binary files a/vllm/__pycache__/tracing.cpython-310.pyc and b/vllm/__pycache__/tracing.cpython-310.pyc differ
diff --git a/vllm/__pycache__/utils.cpython-310.pyc b/vllm/__pycache__/utils.cpython-310.pyc
index 3b90b20..02651de 100644
Binary files a/vllm/__pycache__/utils.cpython-310.pyc and b/vllm/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/__pycache__/version.cpython-310.pyc b/vllm/__pycache__/version.cpython-310.pyc
index de6fe1d..42f846b 100644
Binary files a/vllm/__pycache__/version.cpython-310.pyc and b/vllm/__pycache__/version.cpython-310.pyc differ
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index ac4cce9..64a5534 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1102,4 +1102,4 @@ for k, v in names_and_values.items():
         names_and_values_to_update[k] = hint_on_error(v)
 
 names_and_values.update(names_and_values_to_update)
-del names_and_values_to_update, names_and_values, v, k, fn_type
+del names_and_values_to_update, names_and_values, v, k, fn_type
\ No newline at end of file
diff --git a/vllm/adapter_commons/__pycache__/__init__.cpython-310.pyc b/vllm/adapter_commons/__pycache__/__init__.cpython-310.pyc
index e3d7395..10fe168 100644
Binary files a/vllm/adapter_commons/__pycache__/__init__.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/adapter_commons/__pycache__/layers.cpython-310.pyc b/vllm/adapter_commons/__pycache__/layers.cpython-310.pyc
index 58eba13..e94d5a8 100644
Binary files a/vllm/adapter_commons/__pycache__/layers.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/layers.cpython-310.pyc differ
diff --git a/vllm/adapter_commons/__pycache__/models.cpython-310.pyc b/vllm/adapter_commons/__pycache__/models.cpython-310.pyc
index c430e4e..58b2682 100644
Binary files a/vllm/adapter_commons/__pycache__/models.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/models.cpython-310.pyc differ
diff --git a/vllm/adapter_commons/__pycache__/request.cpython-310.pyc b/vllm/adapter_commons/__pycache__/request.cpython-310.pyc
index 46b7e60..c5beddc 100644
Binary files a/vllm/adapter_commons/__pycache__/request.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/request.cpython-310.pyc differ
diff --git a/vllm/adapter_commons/__pycache__/utils.cpython-310.pyc b/vllm/adapter_commons/__pycache__/utils.cpython-310.pyc
index bd20645..9a9082d 100644
Binary files a/vllm/adapter_commons/__pycache__/utils.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/adapter_commons/__pycache__/worker_manager.cpython-310.pyc b/vllm/adapter_commons/__pycache__/worker_manager.cpython-310.pyc
index 0637c3c..d00fc5c 100644
Binary files a/vllm/adapter_commons/__pycache__/worker_manager.cpython-310.pyc and b/vllm/adapter_commons/__pycache__/worker_manager.cpython-310.pyc differ
diff --git a/vllm/assets/__pycache__/__init__.cpython-310.pyc b/vllm/assets/__pycache__/__init__.cpython-310.pyc
index 6f5c7c6..6d8da66 100644
Binary files a/vllm/assets/__pycache__/__init__.cpython-310.pyc and b/vllm/assets/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/assets/__pycache__/audio.cpython-310.pyc b/vllm/assets/__pycache__/audio.cpython-310.pyc
index 4a4f7a8..f0638ff 100644
Binary files a/vllm/assets/__pycache__/audio.cpython-310.pyc and b/vllm/assets/__pycache__/audio.cpython-310.pyc differ
diff --git a/vllm/assets/__pycache__/base.cpython-310.pyc b/vllm/assets/__pycache__/base.cpython-310.pyc
index a5a838c..2976baa 100644
Binary files a/vllm/assets/__pycache__/base.cpython-310.pyc and b/vllm/assets/__pycache__/base.cpython-310.pyc differ
diff --git a/vllm/assets/__pycache__/image.cpython-310.pyc b/vllm/assets/__pycache__/image.cpython-310.pyc
index 644befd..7abde64 100644
Binary files a/vllm/assets/__pycache__/image.cpython-310.pyc and b/vllm/assets/__pycache__/image.cpython-310.pyc differ
diff --git a/vllm/assets/__pycache__/video.cpython-310.pyc b/vllm/assets/__pycache__/video.cpython-310.pyc
index 6d2fbac..098a5a1 100644
Binary files a/vllm/assets/__pycache__/video.cpython-310.pyc and b/vllm/assets/__pycache__/video.cpython-310.pyc differ
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index 5eec78c..8e30381 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -27,4 +27,4 @@ class ImageAsset:
         """
         image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
                                             s3_prefix=VLM_IMAGES_DIR)
-        return torch.load(image_path)
+        return torch.load(image_path, weights_only=True)
diff --git a/vllm/attention/__pycache__/__init__.cpython-310.pyc b/vllm/attention/__pycache__/__init__.cpython-310.pyc
index 080c3e8..99ccc3b 100644
Binary files a/vllm/attention/__pycache__/__init__.cpython-310.pyc and b/vllm/attention/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/attention/__pycache__/layer.cpython-310.pyc b/vllm/attention/__pycache__/layer.cpython-310.pyc
index 29f52c1..38235bb 100644
Binary files a/vllm/attention/__pycache__/layer.cpython-310.pyc and b/vllm/attention/__pycache__/layer.cpython-310.pyc differ
diff --git a/vllm/attention/__pycache__/selector.cpython-310.pyc b/vllm/attention/__pycache__/selector.cpython-310.pyc
index e6fdecb..382e7b5 100644
Binary files a/vllm/attention/__pycache__/selector.cpython-310.pyc and b/vllm/attention/__pycache__/selector.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc b/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc
index 19f3f54..c61e24b 100644
Binary files a/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc and b/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc b/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc
index bb16759..ecf40f5 100644
Binary files a/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc and b/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/blocksparse_attn.cpython-310.pyc b/vllm/attention/backends/__pycache__/blocksparse_attn.cpython-310.pyc
index 15f6fca..ca08c23 100644
Binary files a/vllm/attention/backends/__pycache__/blocksparse_attn.cpython-310.pyc and b/vllm/attention/backends/__pycache__/blocksparse_attn.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc b/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc
index 810cba2..0d31402 100644
Binary files a/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc and b/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/flashinfer.cpython-310.pyc b/vllm/attention/backends/__pycache__/flashinfer.cpython-310.pyc
index efdfb57..212a05e 100644
Binary files a/vllm/attention/backends/__pycache__/flashinfer.cpython-310.pyc and b/vllm/attention/backends/__pycache__/flashinfer.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/ipex_attn.cpython-310.pyc b/vllm/attention/backends/__pycache__/ipex_attn.cpython-310.pyc
index 9721229..1547d66 100644
Binary files a/vllm/attention/backends/__pycache__/ipex_attn.cpython-310.pyc and b/vllm/attention/backends/__pycache__/ipex_attn.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/openvino.cpython-310.pyc b/vllm/attention/backends/__pycache__/openvino.cpython-310.pyc
index 2b35a08..c57a295 100644
Binary files a/vllm/attention/backends/__pycache__/openvino.cpython-310.pyc and b/vllm/attention/backends/__pycache__/openvino.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/pallas.cpython-310.pyc b/vllm/attention/backends/__pycache__/pallas.cpython-310.pyc
index 08e0554..8d92265 100644
Binary files a/vllm/attention/backends/__pycache__/pallas.cpython-310.pyc and b/vllm/attention/backends/__pycache__/pallas.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/placeholder_attn.cpython-310.pyc b/vllm/attention/backends/__pycache__/placeholder_attn.cpython-310.pyc
index 5ce3aa7..3a982f1 100644
Binary files a/vllm/attention/backends/__pycache__/placeholder_attn.cpython-310.pyc and b/vllm/attention/backends/__pycache__/placeholder_attn.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/rocm_flash_attn.cpython-310.pyc b/vllm/attention/backends/__pycache__/rocm_flash_attn.cpython-310.pyc
index d354027..bac8ec2 100644
Binary files a/vllm/attention/backends/__pycache__/rocm_flash_attn.cpython-310.pyc and b/vllm/attention/backends/__pycache__/rocm_flash_attn.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/torch_sdpa.cpython-310.pyc b/vllm/attention/backends/__pycache__/torch_sdpa.cpython-310.pyc
index d0524c2..2647e0a 100644
Binary files a/vllm/attention/backends/__pycache__/torch_sdpa.cpython-310.pyc and b/vllm/attention/backends/__pycache__/torch_sdpa.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/utils.cpython-310.pyc b/vllm/attention/backends/__pycache__/utils.cpython-310.pyc
index a1e7d37..0669a5f 100644
Binary files a/vllm/attention/backends/__pycache__/utils.cpython-310.pyc and b/vllm/attention/backends/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/attention/backends/__pycache__/xformers.cpython-310.pyc b/vllm/attention/backends/__pycache__/xformers.cpython-310.pyc
index 19ce106..e6bb887 100644
Binary files a/vllm/attention/backends/__pycache__/xformers.cpython-310.pyc and b/vllm/attention/backends/__pycache__/xformers.cpython-310.pyc differ
diff --git a/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc b/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc
index 591955c..4c36349 100644
Binary files a/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc and b/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/attention/ops/__pycache__/ipex_attn.cpython-310.pyc b/vllm/attention/ops/__pycache__/ipex_attn.cpython-310.pyc
index bb52c49..34922a6 100644
Binary files a/vllm/attention/ops/__pycache__/ipex_attn.cpython-310.pyc and b/vllm/attention/ops/__pycache__/ipex_attn.cpython-310.pyc differ
diff --git a/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc b/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc
index ddc8a86..5fe7b1e 100644
Binary files a/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc and b/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc differ
diff --git a/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc b/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc
index bb82b22..b31f3a1 100644
Binary files a/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc and b/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc differ
diff --git a/vllm/attention/ops/__pycache__/triton_flash_attention.cpython-310.pyc b/vllm/attention/ops/__pycache__/triton_flash_attention.cpython-310.pyc
index 33c30d5..f8bd2f5 100644
Binary files a/vllm/attention/ops/__pycache__/triton_flash_attention.cpython-310.pyc and b/vllm/attention/ops/__pycache__/triton_flash_attention.cpython-310.pyc differ
diff --git a/vllm/attention/ops/blocksparse_attention/__pycache__/__init__.cpython-310.pyc b/vllm/attention/ops/blocksparse_attention/__pycache__/__init__.cpython-310.pyc
index ec2a31a..4231da0 100644
Binary files a/vllm/attention/ops/blocksparse_attention/__pycache__/__init__.cpython-310.pyc and b/vllm/attention/ops/blocksparse_attention/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/attention/ops/blocksparse_attention/__pycache__/blocksparse_attention_kernel.cpython-310.pyc b/vllm/attention/ops/blocksparse_attention/__pycache__/blocksparse_attention_kernel.cpython-310.pyc
index 71fa12d..45bec7c 100644
Binary files a/vllm/attention/ops/blocksparse_attention/__pycache__/blocksparse_attention_kernel.cpython-310.pyc and b/vllm/attention/ops/blocksparse_attention/__pycache__/blocksparse_attention_kernel.cpython-310.pyc differ
diff --git a/vllm/attention/ops/blocksparse_attention/__pycache__/interface.cpython-310.pyc b/vllm/attention/ops/blocksparse_attention/__pycache__/interface.cpython-310.pyc
index e01d624..825470f 100644
Binary files a/vllm/attention/ops/blocksparse_attention/__pycache__/interface.cpython-310.pyc and b/vllm/attention/ops/blocksparse_attention/__pycache__/interface.cpython-310.pyc differ
diff --git a/vllm/attention/ops/blocksparse_attention/__pycache__/utils.cpython-310.pyc b/vllm/attention/ops/blocksparse_attention/__pycache__/utils.cpython-310.pyc
index 306f346..3f9fde4 100644
Binary files a/vllm/attention/ops/blocksparse_attention/__pycache__/utils.cpython-310.pyc and b/vllm/attention/ops/blocksparse_attention/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index 1741dd1..c90e8dd 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -4,7 +4,10 @@ from typing import List, Optional, Tuple
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.ops.prefix_prefill import context_attention_fwd
+from vllm.triton_utils import HAS_TRITON
+
+if HAS_TRITON:
+    from vllm.attention.ops.prefix_prefill import context_attention_fwd
 
 # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
 _PARTITION_SIZE = 512
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 9a39e2b..a2a649c 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -808,8 +808,6 @@ if triton.__version__ >= "2.1.0":
             )
             return
 
-        import time
-        ts_beg = time.time()
         _fwd_kernel[grid](
             q,
             k,
@@ -860,6 +858,4 @@ if triton.__version__ >= "2.1.0":
             num_warps=NUM_WARPS,
             num_stages=1,
         )
-        elapsed = time.time() - ts_beg
-        #print(f'{elapsed}: {BLOCK=}, {Lk=}, {Lk_padded=}, {BLOCK=}, {sliding_window=}, {NUM_WARPS=}')
         return
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index d1a0bd2..ae90e03 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -204,9 +204,6 @@ def which_attn_to_use(
         if selected_backend != _Backend.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)
         return _Backend.PALLAS
-    
-    if selected_backend == _Backend.FLASH_ATTN:
-        print("selected_backend == _Backend.FLASH_ATTN")
 
     if is_hip():
         # AMD GPUs.
diff --git a/vllm/compilation/__pycache__/__init__.cpython-310.pyc b/vllm/compilation/__pycache__/__init__.cpython-310.pyc
index 004a088..8f5fb17 100644
Binary files a/vllm/compilation/__pycache__/__init__.cpython-310.pyc and b/vllm/compilation/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/compilation/__pycache__/backends.cpython-310.pyc b/vllm/compilation/__pycache__/backends.cpython-310.pyc
index 1b80e58..3cdd64f 100644
Binary files a/vllm/compilation/__pycache__/backends.cpython-310.pyc and b/vllm/compilation/__pycache__/backends.cpython-310.pyc differ
diff --git a/vllm/compilation/__pycache__/compile_context.cpython-310.pyc b/vllm/compilation/__pycache__/compile_context.cpython-310.pyc
index dc22671..237dede 100644
Binary files a/vllm/compilation/__pycache__/compile_context.cpython-310.pyc and b/vllm/compilation/__pycache__/compile_context.cpython-310.pyc differ
diff --git a/vllm/compilation/__pycache__/decorators.cpython-310.pyc b/vllm/compilation/__pycache__/decorators.cpython-310.pyc
index 8a478e6..822cc60 100644
Binary files a/vllm/compilation/__pycache__/decorators.cpython-310.pyc and b/vllm/compilation/__pycache__/decorators.cpython-310.pyc differ
diff --git a/vllm/compilation/__pycache__/levels.cpython-310.pyc b/vllm/compilation/__pycache__/levels.cpython-310.pyc
index 1a93d5b..50e2e9f 100644
Binary files a/vllm/compilation/__pycache__/levels.cpython-310.pyc and b/vllm/compilation/__pycache__/levels.cpython-310.pyc differ
diff --git a/vllm/compilation/__pycache__/wrapper.cpython-310.pyc b/vllm/compilation/__pycache__/wrapper.cpython-310.pyc
index 8eeb143..d02e998 100644
Binary files a/vllm/compilation/__pycache__/wrapper.cpython-310.pyc and b/vllm/compilation/__pycache__/wrapper.cpython-310.pyc differ
diff --git a/vllm/core/__pycache__/__init__.cpython-310.pyc b/vllm/core/__pycache__/__init__.cpython-310.pyc
index 9f6eb9f..41f5688 100644
Binary files a/vllm/core/__pycache__/__init__.cpython-310.pyc and b/vllm/core/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc b/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc
index 8c402a8..8c4fe9e 100644
Binary files a/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc and b/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc differ
diff --git a/vllm/core/__pycache__/block_manager_v2.cpython-310.pyc b/vllm/core/__pycache__/block_manager_v2.cpython-310.pyc
index dd98dec..7d29f4e 100644
Binary files a/vllm/core/__pycache__/block_manager_v2.cpython-310.pyc and b/vllm/core/__pycache__/block_manager_v2.cpython-310.pyc differ
diff --git a/vllm/core/__pycache__/evictor_v1.cpython-310.pyc b/vllm/core/__pycache__/evictor_v1.cpython-310.pyc
index c3ea008..2caaca7 100644
Binary files a/vllm/core/__pycache__/evictor_v1.cpython-310.pyc and b/vllm/core/__pycache__/evictor_v1.cpython-310.pyc differ
diff --git a/vllm/core/__pycache__/evictor_v2.cpython-310.pyc b/vllm/core/__pycache__/evictor_v2.cpython-310.pyc
index 6f0fe90..d0da42d 100644
Binary files a/vllm/core/__pycache__/evictor_v2.cpython-310.pyc and b/vllm/core/__pycache__/evictor_v2.cpython-310.pyc differ
diff --git a/vllm/core/__pycache__/interfaces.cpython-310.pyc b/vllm/core/__pycache__/interfaces.cpython-310.pyc
index 444f4a8..7fed706 100644
Binary files a/vllm/core/__pycache__/interfaces.cpython-310.pyc and b/vllm/core/__pycache__/interfaces.cpython-310.pyc differ
diff --git a/vllm/core/__pycache__/placeholder_block_space_manager.cpython-310.pyc b/vllm/core/__pycache__/placeholder_block_space_manager.cpython-310.pyc
index 34b1c47..923858d 100644
Binary files a/vllm/core/__pycache__/placeholder_block_space_manager.cpython-310.pyc and b/vllm/core/__pycache__/placeholder_block_space_manager.cpython-310.pyc differ
diff --git a/vllm/core/__pycache__/scheduler.cpython-310.pyc b/vllm/core/__pycache__/scheduler.cpython-310.pyc
index ac0dc40..0f0d191 100644
Binary files a/vllm/core/__pycache__/scheduler.cpython-310.pyc and b/vllm/core/__pycache__/scheduler.cpython-310.pyc differ
diff --git a/vllm/core/block/__pycache__/__init__.cpython-310.pyc b/vllm/core/block/__pycache__/__init__.cpython-310.pyc
index c456286..34aafd7 100644
Binary files a/vllm/core/block/__pycache__/__init__.cpython-310.pyc and b/vllm/core/block/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/core/block/__pycache__/block_table.cpython-310.pyc b/vllm/core/block/__pycache__/block_table.cpython-310.pyc
index 5600c63..08a2cb6 100644
Binary files a/vllm/core/block/__pycache__/block_table.cpython-310.pyc and b/vllm/core/block/__pycache__/block_table.cpython-310.pyc differ
diff --git a/vllm/core/block/__pycache__/common.cpython-310.pyc b/vllm/core/block/__pycache__/common.cpython-310.pyc
index 5ac6490..61171ac 100644
Binary files a/vllm/core/block/__pycache__/common.cpython-310.pyc and b/vllm/core/block/__pycache__/common.cpython-310.pyc differ
diff --git a/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-310.pyc b/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-310.pyc
index b928f93..fe2ad47 100644
Binary files a/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-310.pyc and b/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-310.pyc differ
diff --git a/vllm/core/block/__pycache__/interfaces.cpython-310.pyc b/vllm/core/block/__pycache__/interfaces.cpython-310.pyc
index 5b72ccf..209e865 100644
Binary files a/vllm/core/block/__pycache__/interfaces.cpython-310.pyc and b/vllm/core/block/__pycache__/interfaces.cpython-310.pyc differ
diff --git a/vllm/core/block/__pycache__/naive_block.cpython-310.pyc b/vllm/core/block/__pycache__/naive_block.cpython-310.pyc
index 859d45a..b6ab38d 100644
Binary files a/vllm/core/block/__pycache__/naive_block.cpython-310.pyc and b/vllm/core/block/__pycache__/naive_block.cpython-310.pyc differ
diff --git a/vllm/core/block/__pycache__/prefix_caching_block.cpython-310.pyc b/vllm/core/block/__pycache__/prefix_caching_block.cpython-310.pyc
index ccc5f07..ef02311 100644
Binary files a/vllm/core/block/__pycache__/prefix_caching_block.cpython-310.pyc and b/vllm/core/block/__pycache__/prefix_caching_block.cpython-310.pyc differ
diff --git a/vllm/core/block/__pycache__/utils.cpython-310.pyc b/vllm/core/block/__pycache__/utils.cpython-310.pyc
index c5f9386..bf15d4d 100644
Binary files a/vllm/core/block/__pycache__/utils.cpython-310.pyc and b/vllm/core/block/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/distributed/__pycache__/__init__.cpython-310.pyc b/vllm/distributed/__pycache__/__init__.cpython-310.pyc
index 96bce1f..912dfd7 100644
Binary files a/vllm/distributed/__pycache__/__init__.cpython-310.pyc and b/vllm/distributed/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/distributed/__pycache__/communication_op.cpython-310.pyc b/vllm/distributed/__pycache__/communication_op.cpython-310.pyc
index d3c14b3..52ccfdc 100644
Binary files a/vllm/distributed/__pycache__/communication_op.cpython-310.pyc and b/vllm/distributed/__pycache__/communication_op.cpython-310.pyc differ
diff --git a/vllm/distributed/__pycache__/parallel_state.cpython-310.pyc b/vllm/distributed/__pycache__/parallel_state.cpython-310.pyc
index d3779a1..f696dc6 100644
Binary files a/vllm/distributed/__pycache__/parallel_state.cpython-310.pyc and b/vllm/distributed/__pycache__/parallel_state.cpython-310.pyc differ
diff --git a/vllm/distributed/__pycache__/utils.cpython-310.pyc b/vllm/distributed/__pycache__/utils.cpython-310.pyc
index 424b3ac..c4fac5b 100644
Binary files a/vllm/distributed/__pycache__/utils.cpython-310.pyc and b/vllm/distributed/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/distributed/device_communicators/__pycache__/__init__.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/__init__.cpython-310.pyc
index 58d4309..2619583 100644
Binary files a/vllm/distributed/device_communicators/__pycache__/__init__.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-310.pyc
index f0c4225..b88d48c 100644
Binary files a/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-310.pyc differ
diff --git a/vllm/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-310.pyc
index 17f8c24..045bef6 100644
Binary files a/vllm/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-310.pyc differ
diff --git a/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-310.pyc
index e215e78..44034d5 100644
Binary files a/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-310.pyc differ
diff --git a/vllm/distributed/device_communicators/__pycache__/pynccl.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/pynccl.cpython-310.pyc
index 222c197..7018b7e 100644
Binary files a/vllm/distributed/device_communicators/__pycache__/pynccl.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/pynccl.cpython-310.pyc differ
diff --git a/vllm/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-310.pyc
index 19d5c14..0563cd4 100644
Binary files a/vllm/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-310.pyc differ
diff --git a/vllm/distributed/device_communicators/__pycache__/shm_broadcast.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/shm_broadcast.cpython-310.pyc
index b63c55a..f2b9616 100644
Binary files a/vllm/distributed/device_communicators/__pycache__/shm_broadcast.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/shm_broadcast.cpython-310.pyc differ
diff --git a/vllm/distributed/device_communicators/__pycache__/tpu_communicator.cpython-310.pyc b/vllm/distributed/device_communicators/__pycache__/tpu_communicator.cpython-310.pyc
index 7c438a6..bbe4348 100644
Binary files a/vllm/distributed/device_communicators/__pycache__/tpu_communicator.cpython-310.pyc and b/vllm/distributed/device_communicators/__pycache__/tpu_communicator.cpython-310.pyc differ
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 7d526b2..c41a505 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -217,7 +217,8 @@ class MessageQueue:
             remote_subscribe_port = get_open_port()
             if is_valid_ipv6_address(connect_ip):
                 self.remote_socket.setsockopt(IPV6, 1)
-            socket_addr = f"tcp://*:{remote_subscribe_port}"
+                connect_ip = f"[{connect_ip}]"
+            socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}"
             self.remote_socket.bind(socket_addr)
 
         else:
diff --git a/vllm/engine/__pycache__/__init__.cpython-310.pyc b/vllm/engine/__pycache__/__init__.cpython-310.pyc
index 1312f8c..20b1de9 100644
Binary files a/vllm/engine/__pycache__/__init__.cpython-310.pyc and b/vllm/engine/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/engine/__pycache__/arg_utils.cpython-310.pyc b/vllm/engine/__pycache__/arg_utils.cpython-310.pyc
index cd5d308..93db9cb 100644
Binary files a/vllm/engine/__pycache__/arg_utils.cpython-310.pyc and b/vllm/engine/__pycache__/arg_utils.cpython-310.pyc differ
diff --git a/vllm/engine/__pycache__/async_llm_engine.cpython-310.pyc b/vllm/engine/__pycache__/async_llm_engine.cpython-310.pyc
index 9d8f931..ecb32f4 100644
Binary files a/vllm/engine/__pycache__/async_llm_engine.cpython-310.pyc and b/vllm/engine/__pycache__/async_llm_engine.cpython-310.pyc differ
diff --git a/vllm/engine/__pycache__/async_timeout.cpython-310.pyc b/vllm/engine/__pycache__/async_timeout.cpython-310.pyc
index 941df68..1f080ed 100644
Binary files a/vllm/engine/__pycache__/async_timeout.cpython-310.pyc and b/vllm/engine/__pycache__/async_timeout.cpython-310.pyc differ
diff --git a/vllm/engine/__pycache__/llm_engine.cpython-310.pyc b/vllm/engine/__pycache__/llm_engine.cpython-310.pyc
index 38aed70..c106623 100644
Binary files a/vllm/engine/__pycache__/llm_engine.cpython-310.pyc and b/vllm/engine/__pycache__/llm_engine.cpython-310.pyc differ
diff --git a/vllm/engine/__pycache__/metrics.cpython-310.pyc b/vllm/engine/__pycache__/metrics.cpython-310.pyc
index 0964a20..7067853 100644
Binary files a/vllm/engine/__pycache__/metrics.cpython-310.pyc and b/vllm/engine/__pycache__/metrics.cpython-310.pyc differ
diff --git a/vllm/engine/__pycache__/metrics_types.cpython-310.pyc b/vllm/engine/__pycache__/metrics_types.cpython-310.pyc
index c952429..0ed046f 100644
Binary files a/vllm/engine/__pycache__/metrics_types.cpython-310.pyc and b/vllm/engine/__pycache__/metrics_types.cpython-310.pyc differ
diff --git a/vllm/engine/__pycache__/protocol.cpython-310.pyc b/vllm/engine/__pycache__/protocol.cpython-310.pyc
index e01213f..0e434b6 100644
Binary files a/vllm/engine/__pycache__/protocol.cpython-310.pyc and b/vllm/engine/__pycache__/protocol.cpython-310.pyc differ
diff --git a/vllm/engine/multiprocessing/__pycache__/__init__.cpython-310.pyc b/vllm/engine/multiprocessing/__pycache__/__init__.cpython-310.pyc
index 025eaf5..18aafdf 100644
Binary files a/vllm/engine/multiprocessing/__pycache__/__init__.cpython-310.pyc and b/vllm/engine/multiprocessing/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/engine/multiprocessing/__pycache__/client.cpython-310.pyc b/vllm/engine/multiprocessing/__pycache__/client.cpython-310.pyc
index 8b83b75..a24654b 100644
Binary files a/vllm/engine/multiprocessing/__pycache__/client.cpython-310.pyc and b/vllm/engine/multiprocessing/__pycache__/client.cpython-310.pyc differ
diff --git a/vllm/engine/multiprocessing/__pycache__/engine.cpython-310.pyc b/vllm/engine/multiprocessing/__pycache__/engine.cpython-310.pyc
index 2ebdd9c..52c5583 100644
Binary files a/vllm/engine/multiprocessing/__pycache__/engine.cpython-310.pyc and b/vllm/engine/multiprocessing/__pycache__/engine.cpython-310.pyc differ
diff --git a/vllm/engine/output_processor/__pycache__/__init__.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/__init__.cpython-310.pyc
index 6280127..d16adb2 100644
Binary files a/vllm/engine/output_processor/__pycache__/__init__.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/engine/output_processor/__pycache__/interfaces.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/interfaces.cpython-310.pyc
index ca6075f..13ebd7d 100644
Binary files a/vllm/engine/output_processor/__pycache__/interfaces.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/interfaces.cpython-310.pyc differ
diff --git a/vllm/engine/output_processor/__pycache__/multi_step.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/multi_step.cpython-310.pyc
index 5ab7771..8dc16c9 100644
Binary files a/vllm/engine/output_processor/__pycache__/multi_step.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/multi_step.cpython-310.pyc differ
diff --git a/vllm/engine/output_processor/__pycache__/single_step.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/single_step.cpython-310.pyc
index c447d33..6bbc943 100644
Binary files a/vllm/engine/output_processor/__pycache__/single_step.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/single_step.cpython-310.pyc differ
diff --git a/vllm/engine/output_processor/__pycache__/stop_checker.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/stop_checker.cpython-310.pyc
index f26ea79..c778982 100644
Binary files a/vllm/engine/output_processor/__pycache__/stop_checker.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/stop_checker.cpython-310.pyc differ
diff --git a/vllm/engine/output_processor/__pycache__/util.cpython-310.pyc b/vllm/engine/output_processor/__pycache__/util.cpython-310.pyc
index 2918745..14ba774 100644
Binary files a/vllm/engine/output_processor/__pycache__/util.cpython-310.pyc and b/vllm/engine/output_processor/__pycache__/util.cpython-310.pyc differ
diff --git a/vllm/entrypoints/__pycache__/__init__.cpython-310.pyc b/vllm/entrypoints/__pycache__/__init__.cpython-310.pyc
index 8efbea9..8ebcd41 100644
Binary files a/vllm/entrypoints/__pycache__/__init__.cpython-310.pyc and b/vllm/entrypoints/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/entrypoints/__pycache__/api_server.cpython-310.pyc b/vllm/entrypoints/__pycache__/api_server.cpython-310.pyc
index 0704a52..48c4537 100644
Binary files a/vllm/entrypoints/__pycache__/api_server.cpython-310.pyc and b/vllm/entrypoints/__pycache__/api_server.cpython-310.pyc differ
diff --git a/vllm/entrypoints/__pycache__/chat_utils.cpython-310.pyc b/vllm/entrypoints/__pycache__/chat_utils.cpython-310.pyc
index a1b01ce..37ddfb7 100644
Binary files a/vllm/entrypoints/__pycache__/chat_utils.cpython-310.pyc and b/vllm/entrypoints/__pycache__/chat_utils.cpython-310.pyc differ
diff --git a/vllm/entrypoints/__pycache__/launcher.cpython-310.pyc b/vllm/entrypoints/__pycache__/launcher.cpython-310.pyc
index b8dfa82..b67935d 100644
Binary files a/vllm/entrypoints/__pycache__/launcher.cpython-310.pyc and b/vllm/entrypoints/__pycache__/launcher.cpython-310.pyc differ
diff --git a/vllm/entrypoints/__pycache__/llm.cpython-310.pyc b/vllm/entrypoints/__pycache__/llm.cpython-310.pyc
index 0f22fd2..3f0bb4d 100644
Binary files a/vllm/entrypoints/__pycache__/llm.cpython-310.pyc and b/vllm/entrypoints/__pycache__/llm.cpython-310.pyc differ
diff --git a/vllm/entrypoints/__pycache__/logger.cpython-310.pyc b/vllm/entrypoints/__pycache__/logger.cpython-310.pyc
index 0df3711..e603f65 100644
Binary files a/vllm/entrypoints/__pycache__/logger.cpython-310.pyc and b/vllm/entrypoints/__pycache__/logger.cpython-310.pyc differ
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 41354dc..5195491 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -161,7 +161,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                 return "<image>"
             if model_type == "mllama":
                 return "<|image|>"
-            if model_type == "qwen2_vl":
+            if model_type in ("qwen2_vl","qwen2_5_vl"):
                 return "<|vision_start|><|image_pad|><|vision_end|>"
             if model_type == "molmo":
                 return ""
@@ -172,7 +172,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                 return "<|reserved_special_token_0|>"
             raise TypeError(f"Unknown model type: {model_type}")
         elif modality == "video":
-            if model_type == "qwen2_vl":
+            if model_type in ("qwen2_vl","qwen2_5_vl"):
                 return "<|vision_start|><|video_pad|><|vision_end|>"
             raise TypeError(f"Unknown model type: {model_type}")
         else:
diff --git a/vllm/entrypoints/openai/__pycache__/__init__.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/__init__.cpython-310.pyc
index 84bcaef..d5f09d2 100644
Binary files a/vllm/entrypoints/openai/__pycache__/__init__.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/__pycache__/api_server.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/api_server.cpython-310.pyc
index 59e7f05..6f95699 100644
Binary files a/vllm/entrypoints/openai/__pycache__/api_server.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/api_server.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/__pycache__/cli_args.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/cli_args.cpython-310.pyc
index d67a7cd..e8274a5 100644
Binary files a/vllm/entrypoints/openai/__pycache__/cli_args.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/cli_args.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/__pycache__/logits_processors.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/logits_processors.cpython-310.pyc
index f28c2a5..45a3dbe 100644
Binary files a/vllm/entrypoints/openai/__pycache__/logits_processors.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/logits_processors.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/__pycache__/protocol.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/protocol.cpython-310.pyc
index ddf2de4..a2b1492 100644
Binary files a/vllm/entrypoints/openai/__pycache__/protocol.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/protocol.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/__pycache__/run_batch.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/run_batch.cpython-310.pyc
index 69572f5..acb744c 100644
Binary files a/vllm/entrypoints/openai/__pycache__/run_batch.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/run_batch.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/__pycache__/serving_chat.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/serving_chat.cpython-310.pyc
index c9e2f78..ac60efe 100644
Binary files a/vllm/entrypoints/openai/__pycache__/serving_chat.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/serving_chat.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/__pycache__/serving_completion.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/serving_completion.cpython-310.pyc
index 1928b66..4fb40e4 100644
Binary files a/vllm/entrypoints/openai/__pycache__/serving_completion.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/serving_completion.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/__pycache__/serving_embedding.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/serving_embedding.cpython-310.pyc
index 32a12ca..32f8156 100644
Binary files a/vllm/entrypoints/openai/__pycache__/serving_embedding.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/serving_embedding.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/__pycache__/serving_engine.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/serving_engine.cpython-310.pyc
index 8920d02..f33f7a0 100644
Binary files a/vllm/entrypoints/openai/__pycache__/serving_engine.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/serving_engine.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/__pycache__/serving_tokenization.cpython-310.pyc b/vllm/entrypoints/openai/__pycache__/serving_tokenization.cpython-310.pyc
index 258edf8..73a6915 100644
Binary files a/vllm/entrypoints/openai/__pycache__/serving_tokenization.cpython-310.pyc and b/vllm/entrypoints/openai/__pycache__/serving_tokenization.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index ae44b26..ef279d6 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -3,7 +3,7 @@ import importlib
 import inspect
 import multiprocessing
 import os
-import re
+import regex as re
 import signal
 import socket
 import tempfile
diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/__init__.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/__init__.cpython-310.pyc
index 79baa12..2b6e806 100644
Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/__init__.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/abstract_tool_parser.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/abstract_tool_parser.cpython-310.pyc
index ae2f3f3..77c6782 100644
Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/abstract_tool_parser.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/abstract_tool_parser.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/hermes_tool_parser.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/hermes_tool_parser.cpython-310.pyc
index 82faaba..f621c02 100644
Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/hermes_tool_parser.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/hermes_tool_parser.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/internlm2_tool_parser.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/internlm2_tool_parser.cpython-310.pyc
index e4db40f..15c820d 100644
Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/internlm2_tool_parser.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/internlm2_tool_parser.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/llama_tool_parser.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/llama_tool_parser.cpython-310.pyc
index ce23591..dc102bf 100644
Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/llama_tool_parser.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/llama_tool_parser.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/mistral_tool_parser.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/mistral_tool_parser.cpython-310.pyc
index b3cdc69..c5f6714 100644
Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/mistral_tool_parser.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/mistral_tool_parser.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/tool_parsers/__pycache__/utils.cpython-310.pyc b/vllm/entrypoints/openai/tool_parsers/__pycache__/utils.cpython-310.pyc
index 6d33704..31b9a81 100644
Binary files a/vllm/entrypoints/openai/tool_parsers/__pycache__/utils.cpython-310.pyc and b/vllm/entrypoints/openai/tool_parsers/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
index bcbcda3..f3cc1d9 100644
--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -1,5 +1,5 @@
 import json
-import re
+import regex as re
 from typing import Dict, List, Sequence, Union
 
 import partial_json_parser
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 3cf34bc..f950ab1 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -1,5 +1,5 @@
 import json
-import re
+import regex as re
 from json import JSONDecodeError, JSONDecoder
 from typing import Dict, List, Sequence, Union
 
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index c6dc068..b7d5d0a 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -1,5 +1,5 @@
 import json
-import re
+import regex as re
 from random import choices
 from string import ascii_letters, digits
 from typing import Dict, List, Sequence, Union
diff --git a/vllm/envs.py b/vllm/envs.py
index 4c9b4ae..3361afb 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -66,6 +66,7 @@ if TYPE_CHECKING:
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False
     VLLM_TORCH_COMPILE_LEVEL: int = 0
+    VLLM_V0_USE_OUTLINES_CACHE: bool = False
 
 
 def get_default_cache_root():
@@ -430,6 +431,12 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":
     lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0"
                            ) == "1",
+    
+    # Whether to turn on the outlines cache for V0
+    # This cache is unbounded and on disk, so it's not safe to use in
+    # an environment with potentially malicious users.
+    "VLLM_V0_USE_OUTLINES_CACHE":
+    lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1",
 }
 
 # end-env-vars-definition
diff --git a/vllm/executor/__pycache__/__init__.cpython-310.pyc b/vllm/executor/__pycache__/__init__.cpython-310.pyc
index 06de5ea..a443572 100644
Binary files a/vllm/executor/__pycache__/__init__.cpython-310.pyc and b/vllm/executor/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/cpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/cpu_executor.cpython-310.pyc
index 0032ddf..6286a08 100644
Binary files a/vllm/executor/__pycache__/cpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/cpu_executor.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/distributed_gpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/distributed_gpu_executor.cpython-310.pyc
index ae7c76d..2a851d8 100644
Binary files a/vllm/executor/__pycache__/distributed_gpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/distributed_gpu_executor.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/executor_base.cpython-310.pyc b/vllm/executor/__pycache__/executor_base.cpython-310.pyc
index ccef798..6c483d2 100644
Binary files a/vllm/executor/__pycache__/executor_base.cpython-310.pyc and b/vllm/executor/__pycache__/executor_base.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/gpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/gpu_executor.cpython-310.pyc
index 42aa355..8d7cc79 100644
Binary files a/vllm/executor/__pycache__/gpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/gpu_executor.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/msgspec_utils.cpython-310.pyc b/vllm/executor/__pycache__/msgspec_utils.cpython-310.pyc
index d824251..9dd4390 100644
Binary files a/vllm/executor/__pycache__/msgspec_utils.cpython-310.pyc and b/vllm/executor/__pycache__/msgspec_utils.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/multiproc_gpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/multiproc_gpu_executor.cpython-310.pyc
index 57a1ff8..a9f62cd 100644
Binary files a/vllm/executor/__pycache__/multiproc_gpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/multiproc_gpu_executor.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/multiproc_worker_utils.cpython-310.pyc b/vllm/executor/__pycache__/multiproc_worker_utils.cpython-310.pyc
index b092866..7d6a611 100644
Binary files a/vllm/executor/__pycache__/multiproc_worker_utils.cpython-310.pyc and b/vllm/executor/__pycache__/multiproc_worker_utils.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/multiproc_xpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/multiproc_xpu_executor.cpython-310.pyc
index 4b31776..af0c391 100644
Binary files a/vllm/executor/__pycache__/multiproc_xpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/multiproc_xpu_executor.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/neuron_executor.cpython-310.pyc b/vllm/executor/__pycache__/neuron_executor.cpython-310.pyc
index 2b14ed3..778b237 100644
Binary files a/vllm/executor/__pycache__/neuron_executor.cpython-310.pyc and b/vllm/executor/__pycache__/neuron_executor.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/openvino_executor.cpython-310.pyc b/vllm/executor/__pycache__/openvino_executor.cpython-310.pyc
index 79fbeb9..43c6b1f 100644
Binary files a/vllm/executor/__pycache__/openvino_executor.cpython-310.pyc and b/vllm/executor/__pycache__/openvino_executor.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/ray_gpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/ray_gpu_executor.cpython-310.pyc
index 2aaa6fe..e279213 100644
Binary files a/vllm/executor/__pycache__/ray_gpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/ray_gpu_executor.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/ray_tpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/ray_tpu_executor.cpython-310.pyc
index 7ac3d79..eb316e4 100644
Binary files a/vllm/executor/__pycache__/ray_tpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/ray_tpu_executor.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/ray_utils.cpython-310.pyc b/vllm/executor/__pycache__/ray_utils.cpython-310.pyc
index a509882..58e6d0d 100644
Binary files a/vllm/executor/__pycache__/ray_utils.cpython-310.pyc and b/vllm/executor/__pycache__/ray_utils.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/ray_xpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/ray_xpu_executor.cpython-310.pyc
index b78487c..686dd9b 100644
Binary files a/vllm/executor/__pycache__/ray_xpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/ray_xpu_executor.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/tpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/tpu_executor.cpython-310.pyc
index 02db6b6..59a4791 100644
Binary files a/vllm/executor/__pycache__/tpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/tpu_executor.cpython-310.pyc differ
diff --git a/vllm/executor/__pycache__/xpu_executor.cpython-310.pyc b/vllm/executor/__pycache__/xpu_executor.cpython-310.pyc
index 651acb1..bbd8076 100644
Binary files a/vllm/executor/__pycache__/xpu_executor.cpython-310.pyc and b/vllm/executor/__pycache__/xpu_executor.cpython-310.pyc differ
diff --git a/vllm/inputs/__pycache__/__init__.cpython-310.pyc b/vllm/inputs/__pycache__/__init__.cpython-310.pyc
index 565b6e4..1ecbfd4 100644
Binary files a/vllm/inputs/__pycache__/__init__.cpython-310.pyc and b/vllm/inputs/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/inputs/__pycache__/data.cpython-310.pyc b/vllm/inputs/__pycache__/data.cpython-310.pyc
index b69bc51..437d732 100644
Binary files a/vllm/inputs/__pycache__/data.cpython-310.pyc and b/vllm/inputs/__pycache__/data.cpython-310.pyc differ
diff --git a/vllm/inputs/__pycache__/parse.cpython-310.pyc b/vllm/inputs/__pycache__/parse.cpython-310.pyc
index f11cc04..4664dc3 100644
Binary files a/vllm/inputs/__pycache__/parse.cpython-310.pyc and b/vllm/inputs/__pycache__/parse.cpython-310.pyc differ
diff --git a/vllm/inputs/__pycache__/preprocess.cpython-310.pyc b/vllm/inputs/__pycache__/preprocess.cpython-310.pyc
index 1857735..f0ab201 100644
Binary files a/vllm/inputs/__pycache__/preprocess.cpython-310.pyc and b/vllm/inputs/__pycache__/preprocess.cpython-310.pyc differ
diff --git a/vllm/inputs/__pycache__/registry.cpython-310.pyc b/vllm/inputs/__pycache__/registry.cpython-310.pyc
index c03e8fd..8eaef98 100644
Binary files a/vllm/inputs/__pycache__/registry.cpython-310.pyc and b/vllm/inputs/__pycache__/registry.cpython-310.pyc differ
diff --git a/vllm/logging/__pycache__/__init__.cpython-310.pyc b/vllm/logging/__pycache__/__init__.cpython-310.pyc
index f37e9d2..84691e3 100644
Binary files a/vllm/logging/__pycache__/__init__.cpython-310.pyc and b/vllm/logging/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/logging/__pycache__/formatter.cpython-310.pyc b/vllm/logging/__pycache__/formatter.cpython-310.pyc
index afb951e..ac687a9 100644
Binary files a/vllm/logging/__pycache__/formatter.cpython-310.pyc and b/vllm/logging/__pycache__/formatter.cpython-310.pyc differ
diff --git a/vllm/lora/__pycache__/__init__.cpython-310.pyc b/vllm/lora/__pycache__/__init__.cpython-310.pyc
index 73ac80f..6a0040c 100644
Binary files a/vllm/lora/__pycache__/__init__.cpython-310.pyc and b/vllm/lora/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/lora/__pycache__/fully_sharded_layers.cpython-310.pyc b/vllm/lora/__pycache__/fully_sharded_layers.cpython-310.pyc
index 652738e..7e13e76 100644
Binary files a/vllm/lora/__pycache__/fully_sharded_layers.cpython-310.pyc and b/vllm/lora/__pycache__/fully_sharded_layers.cpython-310.pyc differ
diff --git a/vllm/lora/__pycache__/layers.cpython-310.pyc b/vllm/lora/__pycache__/layers.cpython-310.pyc
index bcc2b16..7d1ee4c 100644
Binary files a/vllm/lora/__pycache__/layers.cpython-310.pyc and b/vllm/lora/__pycache__/layers.cpython-310.pyc differ
diff --git a/vllm/lora/__pycache__/lora.cpython-310.pyc b/vllm/lora/__pycache__/lora.cpython-310.pyc
index 7a5f001..f0da988 100644
Binary files a/vllm/lora/__pycache__/lora.cpython-310.pyc and b/vllm/lora/__pycache__/lora.cpython-310.pyc differ
diff --git a/vllm/lora/__pycache__/models.cpython-310.pyc b/vllm/lora/__pycache__/models.cpython-310.pyc
index 0b47ee7..63d5a53 100644
Binary files a/vllm/lora/__pycache__/models.cpython-310.pyc and b/vllm/lora/__pycache__/models.cpython-310.pyc differ
diff --git a/vllm/lora/__pycache__/punica.cpython-310.pyc b/vllm/lora/__pycache__/punica.cpython-310.pyc
index 1b72815..aeedbe9 100644
Binary files a/vllm/lora/__pycache__/punica.cpython-310.pyc and b/vllm/lora/__pycache__/punica.cpython-310.pyc differ
diff --git a/vllm/lora/__pycache__/request.cpython-310.pyc b/vllm/lora/__pycache__/request.cpython-310.pyc
index fba3fa2..0d466ee 100644
Binary files a/vllm/lora/__pycache__/request.cpython-310.pyc and b/vllm/lora/__pycache__/request.cpython-310.pyc differ
diff --git a/vllm/lora/__pycache__/utils.cpython-310.pyc b/vllm/lora/__pycache__/utils.cpython-310.pyc
index cf125f8..76b259e 100644
Binary files a/vllm/lora/__pycache__/utils.cpython-310.pyc and b/vllm/lora/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/lora/__pycache__/worker_manager.cpython-310.pyc b/vllm/lora/__pycache__/worker_manager.cpython-310.pyc
index f99a173..68fe572 100644
Binary files a/vllm/lora/__pycache__/worker_manager.cpython-310.pyc and b/vllm/lora/__pycache__/worker_manager.cpython-310.pyc differ
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index aaadca9..8805d05 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -2,7 +2,7 @@ import copy
 import json
 import math
 import os
-import re
+import regex as re
 from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional, Type
 
@@ -263,7 +263,7 @@ class LoRAModel(AdapterModel):
                 new_embeddings_tensor_path)
         elif os.path.isfile(new_embeddings_bin_file_path):
             embeddings = torch.load(new_embeddings_bin_file_path,
-                                    map_location=device)
+                                    map_location=device, weights_only=True)
 
         rank = config["r"]
         lora_alpha = config["lora_alpha"]
diff --git a/vllm/lora/ops/__pycache__/__init__.cpython-310.pyc b/vllm/lora/ops/__pycache__/__init__.cpython-310.pyc
index 9ede199..85eb3c1 100644
Binary files a/vllm/lora/ops/__pycache__/__init__.cpython-310.pyc and b/vllm/lora/ops/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/lora/ops/__pycache__/bgmv_expand.cpython-310.pyc b/vllm/lora/ops/__pycache__/bgmv_expand.cpython-310.pyc
index 217f22f..7e6e3cb 100644
Binary files a/vllm/lora/ops/__pycache__/bgmv_expand.cpython-310.pyc and b/vllm/lora/ops/__pycache__/bgmv_expand.cpython-310.pyc differ
diff --git a/vllm/lora/ops/__pycache__/bgmv_expand_slice.cpython-310.pyc b/vllm/lora/ops/__pycache__/bgmv_expand_slice.cpython-310.pyc
index da16abe..c64e274 100644
Binary files a/vllm/lora/ops/__pycache__/bgmv_expand_slice.cpython-310.pyc and b/vllm/lora/ops/__pycache__/bgmv_expand_slice.cpython-310.pyc differ
diff --git a/vllm/lora/ops/__pycache__/bgmv_shrink.cpython-310.pyc b/vllm/lora/ops/__pycache__/bgmv_shrink.cpython-310.pyc
index 0f98091..3938bb8 100644
Binary files a/vllm/lora/ops/__pycache__/bgmv_shrink.cpython-310.pyc and b/vllm/lora/ops/__pycache__/bgmv_shrink.cpython-310.pyc differ
diff --git a/vllm/lora/ops/__pycache__/sgmv_expand.cpython-310.pyc b/vllm/lora/ops/__pycache__/sgmv_expand.cpython-310.pyc
index 08cfc9f..1e54c33 100644
Binary files a/vllm/lora/ops/__pycache__/sgmv_expand.cpython-310.pyc and b/vllm/lora/ops/__pycache__/sgmv_expand.cpython-310.pyc differ
diff --git a/vllm/lora/ops/__pycache__/sgmv_expand_slice.cpython-310.pyc b/vllm/lora/ops/__pycache__/sgmv_expand_slice.cpython-310.pyc
index fdc90b3..51f1d6a 100644
Binary files a/vllm/lora/ops/__pycache__/sgmv_expand_slice.cpython-310.pyc and b/vllm/lora/ops/__pycache__/sgmv_expand_slice.cpython-310.pyc differ
diff --git a/vllm/lora/ops/__pycache__/sgmv_shrink.cpython-310.pyc b/vllm/lora/ops/__pycache__/sgmv_shrink.cpython-310.pyc
index e7f1bfa..00b0ff3 100644
Binary files a/vllm/lora/ops/__pycache__/sgmv_shrink.cpython-310.pyc and b/vllm/lora/ops/__pycache__/sgmv_shrink.cpython-310.pyc differ
diff --git a/vllm/lora/ops/__pycache__/utils.cpython-310.pyc b/vllm/lora/ops/__pycache__/utils.cpython-310.pyc
index dd89854..762bb68 100644
Binary files a/vllm/lora/ops/__pycache__/utils.cpython-310.pyc and b/vllm/lora/ops/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index a780429..066d94a 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -1,5 +1,5 @@
 import os
-import re
+import regex as re
 from typing import List, Optional, Set, Tuple, Type, Union
 
 import huggingface_hub
diff --git a/vllm/model_executor/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/__pycache__/__init__.cpython-310.pyc
index 3e896cf..984b8c1 100644
Binary files a/vllm/model_executor/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/__pycache__/custom_op.cpython-310.pyc b/vllm/model_executor/__pycache__/custom_op.cpython-310.pyc
index 3548469..ddaac9a 100644
Binary files a/vllm/model_executor/__pycache__/custom_op.cpython-310.pyc and b/vllm/model_executor/__pycache__/custom_op.cpython-310.pyc differ
diff --git a/vllm/model_executor/__pycache__/parameter.cpython-310.pyc b/vllm/model_executor/__pycache__/parameter.cpython-310.pyc
index fe4df37..78a8ef1 100644
Binary files a/vllm/model_executor/__pycache__/parameter.cpython-310.pyc and b/vllm/model_executor/__pycache__/parameter.cpython-310.pyc differ
diff --git a/vllm/model_executor/__pycache__/pooling_metadata.cpython-310.pyc b/vllm/model_executor/__pycache__/pooling_metadata.cpython-310.pyc
index 354b163..c03a4f1 100644
Binary files a/vllm/model_executor/__pycache__/pooling_metadata.cpython-310.pyc and b/vllm/model_executor/__pycache__/pooling_metadata.cpython-310.pyc differ
diff --git a/vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc b/vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc
index c02d291..d919ff2 100644
Binary files a/vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc and b/vllm/model_executor/__pycache__/sampling_metadata.cpython-310.pyc differ
diff --git a/vllm/model_executor/__pycache__/utils.cpython-310.pyc b/vllm/model_executor/__pycache__/utils.cpython-310.pyc
index 4036015..96650f9 100644
Binary files a/vllm/model_executor/__pycache__/utils.cpython-310.pyc and b/vllm/model_executor/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/model_executor/guided_decoding/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/guided_decoding/__pycache__/__init__.cpython-310.pyc
index 35cbb04..59e35da 100644
Binary files a/vllm/model_executor/guided_decoding/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/guided_decoding/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/guided_decoding/__pycache__/guided_fields.cpython-310.pyc b/vllm/model_executor/guided_decoding/__pycache__/guided_fields.cpython-310.pyc
index 8db14b7..fbc7392 100644
Binary files a/vllm/model_executor/guided_decoding/__pycache__/guided_fields.cpython-310.pyc and b/vllm/model_executor/guided_decoding/__pycache__/guided_fields.cpython-310.pyc differ
diff --git a/vllm/model_executor/guided_decoding/__pycache__/lm_format_enforcer_decoding.cpython-310.pyc b/vllm/model_executor/guided_decoding/__pycache__/lm_format_enforcer_decoding.cpython-310.pyc
index 071dcb0..04aec4d 100644
Binary files a/vllm/model_executor/guided_decoding/__pycache__/lm_format_enforcer_decoding.cpython-310.pyc and b/vllm/model_executor/guided_decoding/__pycache__/lm_format_enforcer_decoding.cpython-310.pyc differ
diff --git a/vllm/model_executor/guided_decoding/__pycache__/outlines_decoding.cpython-310.pyc b/vllm/model_executor/guided_decoding/__pycache__/outlines_decoding.cpython-310.pyc
index 8de5d11..da22e84 100644
Binary files a/vllm/model_executor/guided_decoding/__pycache__/outlines_decoding.cpython-310.pyc and b/vllm/model_executor/guided_decoding/__pycache__/outlines_decoding.cpython-310.pyc differ
diff --git a/vllm/model_executor/guided_decoding/__pycache__/outlines_logits_processors.cpython-310.pyc b/vllm/model_executor/guided_decoding/__pycache__/outlines_logits_processors.cpython-310.pyc
index 833a519..d6c6685 100644
Binary files a/vllm/model_executor/guided_decoding/__pycache__/outlines_logits_processors.cpython-310.pyc and b/vllm/model_executor/guided_decoding/__pycache__/outlines_logits_processors.cpython-310.pyc differ
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index c28bd71..fb8db44 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -23,12 +23,21 @@ from typing import Callable, DefaultDict, Dict, List, Union
 import torch
 from lark import Lark
 from outlines import grammars
-from outlines.caching import cache
+from outlines.caching import cache, disable_cache
 from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write
 from outlines.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase
+import vllm.envs as envs
+from vllm.logger import init_logger
+logger = init_logger(__name__)
 
+if envs.VLLM_V0_USE_OUTLINES_CACHE:
+    logger.warning("Enabling outlines cache. This is an unbounded on-disk "
+                   "cache. It may consume a lot of disk space and should "
+                   "not be used with untrusted clients.")
+else:
+    disable_cache()
 
 class BaseLogitsProcessor:
 
diff --git a/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc
index 49d5dd0..7d3f145 100644
Binary files a/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/activation.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/activation.cpython-310.pyc
index a6a2ab0..6d39d5d 100644
Binary files a/vllm/model_executor/layers/__pycache__/activation.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/activation.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc
index 0f09c8c..e77cc9c 100644
Binary files a/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/linear.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/linear.cpython-310.pyc
index be35bf4..c1cc88f 100644
Binary files a/vllm/model_executor/layers/__pycache__/linear.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/linear.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/logits_processor.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/logits_processor.cpython-310.pyc
index bf46f27..f996ecf 100644
Binary files a/vllm/model_executor/layers/__pycache__/logits_processor.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/logits_processor.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/pooler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/pooler.cpython-310.pyc
index a97b752..dbb2dae 100644
Binary files a/vllm/model_executor/layers/__pycache__/pooler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/pooler.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc
index e99e319..c6362a4 100644
Binary files a/vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/rejection_sampler.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/resampler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/resampler.cpython-310.pyc
index 7402f04..3807358 100644
Binary files a/vllm/model_executor/layers/__pycache__/resampler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/resampler.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/rotary_embedding.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/rotary_embedding.cpython-310.pyc
index 4e5f06b..6eecd9e 100644
Binary files a/vllm/model_executor/layers/__pycache__/rotary_embedding.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/rotary_embedding.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/sampler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/sampler.cpython-310.pyc
index 22d4863..4fa31a3 100644
Binary files a/vllm/model_executor/layers/__pycache__/sampler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/sampler.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/spec_decode_base_sampler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/spec_decode_base_sampler.cpython-310.pyc
index 879c57f..ae776c2 100644
Binary files a/vllm/model_executor/layers/__pycache__/spec_decode_base_sampler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/spec_decode_base_sampler.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/typical_acceptance_sampler.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/typical_acceptance_sampler.cpython-310.pyc
index 6c3ed20..9315154 100644
Binary files a/vllm/model_executor/layers/__pycache__/typical_acceptance_sampler.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/typical_acceptance_sampler.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc b/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc
index 2c1fcda..321681b 100644
Binary files a/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc and b/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/__init__.cpython-310.pyc
index 51e1059..2b6e09f 100644
Binary files a/vllm/model_executor/layers/fused_moe/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/fused_moe/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/fused_marlin_moe.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/fused_marlin_moe.cpython-310.pyc
index eea9303..e8c7e52 100644
Binary files a/vllm/model_executor/layers/fused_moe/__pycache__/fused_marlin_moe.cpython-310.pyc and b/vllm/model_executor/layers/fused_moe/__pycache__/fused_marlin_moe.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-310.pyc
index ed50e46..0fd8959 100644
Binary files a/vllm/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-310.pyc and b/vllm/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/layer.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/layer.cpython-310.pyc
index a3b03b7..9684cff 100644
Binary files a/vllm/model_executor/layers/fused_moe/__pycache__/layer.cpython-310.pyc and b/vllm/model_executor/layers/fused_moe/__pycache__/layer.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/fused_moe/__pycache__/moe_pallas.cpython-310.pyc b/vllm/model_executor/layers/fused_moe/__pycache__/moe_pallas.cpython-310.pyc
index a09e693..afcf218 100644
Binary files a/vllm/model_executor/layers/fused_moe/__pycache__/moe_pallas.cpython-310.pyc and b/vllm/model_executor/layers/fused_moe/__pycache__/moe_pallas.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 8ed74ef..789a77e 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -132,7 +132,7 @@ class UnquantizedLinearMethod(LinearMethodBase):
               layer: torch.nn.Module,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        if (x.shape[0] == 16384 or x.shape[0] == 15360):
+        if (x.shape[0] == 8192 or x.shape[0] == 16384 or x.shape[0] == 15360):
             if bias is None:
                 return x @ layer.weight.T
             else:
diff --git a/vllm/model_executor/layers/mamba/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/mamba/__pycache__/__init__.cpython-310.pyc
index 888ccc9..bff33db 100644
Binary files a/vllm/model_executor/layers/mamba/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/mamba/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/mamba/ops/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/mamba/ops/__pycache__/__init__.cpython-310.pyc
index a095ccb..db529d1 100644
Binary files a/vllm/model_executor/layers/mamba/ops/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/mamba/ops/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/mamba/ops/__pycache__/causal_conv1d.cpython-310.pyc b/vllm/model_executor/layers/mamba/ops/__pycache__/causal_conv1d.cpython-310.pyc
index 5b34390..a809675 100644
Binary files a/vllm/model_executor/layers/mamba/ops/__pycache__/causal_conv1d.cpython-310.pyc and b/vllm/model_executor/layers/mamba/ops/__pycache__/causal_conv1d.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/mamba/ops/__pycache__/mamba_ssm.cpython-310.pyc b/vllm/model_executor/layers/mamba/ops/__pycache__/mamba_ssm.cpython-310.pyc
index 0613b1e..9df7c51 100644
Binary files a/vllm/model_executor/layers/mamba/ops/__pycache__/mamba_ssm.cpython-310.pyc and b/vllm/model_executor/layers/mamba/ops/__pycache__/mamba_ssm.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-310.pyc
index 41b1bb6..c2312d9 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-310.pyc
index 970f5d3..97c6107 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/aqlm.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-310.pyc
index c6291af..33b34e6 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/awq.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-310.pyc
index 47ed177..bccc8e3 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-310.pyc
index 7f1d394..35acfdb 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/awq_triton.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-310.pyc
index 0759d0c..39e0f44 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/base_config.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-310.pyc
index a4f4198..5e8362e 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-310.pyc
index 6213ec2..a31daaa 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-310.pyc
index ea4f3a1..8957b27 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/experts_int8.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-310.pyc
index ac659f7..54b49c9 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-310.pyc
index e60b9d0..69d150f 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/fp8.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-310.pyc
index 63e7832..72e41da 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/gguf.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-310.pyc
index 7be428a..a9d75f1 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/gptq.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-310.pyc
index 3e2ccc9..d54eeef 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-310.pyc
index 7041cbc..fbc58d2 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-310.pyc
index f3d9348..4b11c2f 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/ipex_quant.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-310.pyc
index e9a9e83..44e5b41 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/kv_cache.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-310.pyc
index 326ec50..5f1f5fc 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/marlin.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-310.pyc
index acafb63..87d5593 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/modelopt.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-310.pyc
index 8e7cd18..540c07b 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/neuron_quant.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-310.pyc
index 0993354..32780cf 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/qqq.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-310.pyc
index b0cf65f..e1ee39c 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/schema.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-310.pyc
index 39b8434..470be32 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/tpu_int8.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/__pycache__/w8a16.cpython-310.pyc b/vllm/model_executor/layers/quantization/__pycache__/w8a16.cpython-310.pyc
index cc44e16..9a134f3 100644
Binary files a/vllm/model_executor/layers/quantization/__pycache__/w8a16.cpython-310.pyc and b/vllm/model_executor/layers/quantization/__pycache__/w8a16.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-310.pyc
index 6452a20..a551083 100644
Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-310.pyc
index d9325cc..db6f166 100644
Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-310.pyc
index 73b4a52..c2bceb0 100644
Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-310.pyc
index 1e28ce3..408ed02 100644
Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-310.pyc
index b30bdb5..d24a3fc 100644
Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-310.pyc
index 8bdd453..fe4692a 100644
Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-310.pyc
index 4c5d1b6..258d504 100644
Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-310.pyc
index 2f1b90c..a286914 100644
Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-310.pyc
index be73cc2..9d03344 100644
Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-310.pyc
index 8776b45..f2aa37c 100644
Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-310.pyc b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-310.pyc
index 7fea985..c84fb93 100644
Binary files a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-310.pyc and b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index fc531b9..785f7ff 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,4 +1,4 @@
-import re
+import regex as re
 from enum import Enum
 from typing import Any, Dict, Iterable, Optional, Union
 
diff --git a/vllm/model_executor/layers/quantization/kernels/__pycache__/MPLinearKernel.cpython-310.pyc b/vllm/model_executor/layers/quantization/kernels/__pycache__/MPLinearKernel.cpython-310.pyc
index caa2d19..6690d8b 100644
Binary files a/vllm/model_executor/layers/quantization/kernels/__pycache__/MPLinearKernel.cpython-310.pyc and b/vllm/model_executor/layers/quantization/kernels/__pycache__/MPLinearKernel.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-310.pyc
index 72e7450..f2827fc 100644
Binary files a/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/kernels/__pycache__/machete.cpython-310.pyc b/vllm/model_executor/layers/quantization/kernels/__pycache__/machete.cpython-310.pyc
index 6a67b05..6c0760b 100644
Binary files a/vllm/model_executor/layers/quantization/kernels/__pycache__/machete.cpython-310.pyc and b/vllm/model_executor/layers/quantization/kernels/__pycache__/machete.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/kernels/__pycache__/marlin.cpython-310.pyc b/vllm/model_executor/layers/quantization/kernels/__pycache__/marlin.cpython-310.pyc
index 478759a..b226a8e 100644
Binary files a/vllm/model_executor/layers/quantization/kernels/__pycache__/marlin.cpython-310.pyc and b/vllm/model_executor/layers/quantization/kernels/__pycache__/marlin.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-310.pyc
index 283737e..ea126a2 100644
Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/layer_utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/layer_utils.cpython-310.pyc
index 9c7dfc0..7a710e9 100644
Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/layer_utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/layer_utils.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/machete_utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/machete_utils.cpython-310.pyc
index 9ec857a..1d23696 100644
Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/machete_utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/machete_utils.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils.cpython-310.pyc
index 338b186..8c4e2f5 100644
Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp8.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp8.cpython-310.pyc
index cc9b2e8..fbd93ab 100644
Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp8.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp8.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-310.pyc
index f8cb1bc..c5929a4 100644
Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-310.pyc
index 16f3a63..26930bc 100644
Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_qqq.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_qqq.cpython-310.pyc
index 119fea0..3bf1f4b 100644
Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_qqq.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_qqq.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-310.pyc
index 076fe51..5379065 100644
Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-310.pyc differ
diff --git a/vllm/model_executor/layers/quantization/utils/__pycache__/w8a8_utils.cpython-310.pyc b/vllm/model_executor/layers/quantization/utils/__pycache__/w8a8_utils.cpython-310.pyc
index 14fcc99..e32cfbf 100644
Binary files a/vllm/model_executor/layers/quantization/utils/__pycache__/w8a8_utils.cpython-310.pyc and b/vllm/model_executor/layers/quantization/utils/__pycache__/w8a8_utils.cpython-310.pyc differ
diff --git a/vllm/model_executor/model_loader/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/__init__.cpython-310.pyc
index 3a5a9b4..93f94a4 100644
Binary files a/vllm/model_executor/model_loader/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/model_loader/__pycache__/loader.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/loader.cpython-310.pyc
index 82e3897..50280b7 100644
Binary files a/vllm/model_executor/model_loader/__pycache__/loader.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/loader.cpython-310.pyc differ
diff --git a/vllm/model_executor/model_loader/__pycache__/neuron.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/neuron.cpython-310.pyc
index c6314a9..4d66454 100644
Binary files a/vllm/model_executor/model_loader/__pycache__/neuron.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/neuron.cpython-310.pyc differ
diff --git a/vllm/model_executor/model_loader/__pycache__/openvino.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/openvino.cpython-310.pyc
index d2e597f..f256d7f 100644
Binary files a/vllm/model_executor/model_loader/__pycache__/openvino.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/openvino.cpython-310.pyc differ
diff --git a/vllm/model_executor/model_loader/__pycache__/tensorizer.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/tensorizer.cpython-310.pyc
index 3bd15bb..8993fb1 100644
Binary files a/vllm/model_executor/model_loader/__pycache__/tensorizer.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/tensorizer.cpython-310.pyc differ
diff --git a/vllm/model_executor/model_loader/__pycache__/utils.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/utils.cpython-310.pyc
index 059d9cd..33127c2 100644
Binary files a/vllm/model_executor/model_loader/__pycache__/utils.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/model_executor/model_loader/__pycache__/weight_utils.cpython-310.pyc b/vllm/model_executor/model_loader/__pycache__/weight_utils.cpython-310.pyc
index ccaa569..a2a2667 100644
Binary files a/vllm/model_executor/model_loader/__pycache__/weight_utils.cpython-310.pyc and b/vllm/model_executor/model_loader/__pycache__/weight_utils.cpython-310.pyc differ
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 36f33d6..da224ec 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -2,7 +2,7 @@ import argparse
 import dataclasses
 import io
 import os
-import re
+import regex as re
 import time
 from dataclasses import dataclass
 from functools import partial
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 1e2857e..746b0be 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -85,7 +85,7 @@ def convert_bin_to_safetensor_file(
     pt_filename: str,
     sf_filename: str,
 ) -> None:
-    loaded = torch.load(pt_filename, map_location="cpu")
+    loaded = torch.load(pt_filename, map_location="cpu", weights_only=True)
     if "state_dict" in loaded:
         loaded = loaded["state_dict"]
     shared = _shared_pointers(loaded)
@@ -373,7 +373,7 @@ def np_cache_weights_iterator(
                     disable=not enable_tqdm,
                     bar_format=_BAR_FORMAT,
             ):
-                state = torch.load(bin_file, map_location="cpu")
+                state = torch.load(bin_file, map_location="cpu", weights_only=True)
                 for name, param in state.items():
                     param_path = os.path.join(np_folder, name)
                     with open(param_path, "wb") as f:
@@ -422,7 +422,7 @@ def pt_weights_iterator(
             disable=not enable_tqdm,
             bar_format=_BAR_FORMAT,
     ):
-        state = torch.load(bin_file, map_location="cpu")
+        state = torch.load(bin_file, map_location="cpu", weights_only=True)
         for name, param in state.items():
             yield name, param
         del state
diff --git a/vllm/model_executor/models/__pycache__/__init__.cpython-310.pyc b/vllm/model_executor/models/__pycache__/__init__.cpython-310.pyc
index 5ef0ec3..d17d8c8 100644
Binary files a/vllm/model_executor/models/__pycache__/__init__.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/arctic.cpython-310.pyc b/vllm/model_executor/models/__pycache__/arctic.cpython-310.pyc
index 7eac109..3dd3e7c 100644
Binary files a/vllm/model_executor/models/__pycache__/arctic.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/arctic.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/baichuan.cpython-310.pyc b/vllm/model_executor/models/__pycache__/baichuan.cpython-310.pyc
index 7318751..b43274e 100644
Binary files a/vllm/model_executor/models/__pycache__/baichuan.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/baichuan.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/bart.cpython-310.pyc b/vllm/model_executor/models/__pycache__/bart.cpython-310.pyc
index 4c44deb..363f805 100644
Binary files a/vllm/model_executor/models/__pycache__/bart.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/bart.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/blip.cpython-310.pyc b/vllm/model_executor/models/__pycache__/blip.cpython-310.pyc
index d540977..f27531a 100644
Binary files a/vllm/model_executor/models/__pycache__/blip.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/blip.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/blip2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/blip2.cpython-310.pyc
index 382fc21..ff0f67f 100644
Binary files a/vllm/model_executor/models/__pycache__/blip2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/blip2.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/bloom.cpython-310.pyc b/vllm/model_executor/models/__pycache__/bloom.cpython-310.pyc
index 81e0e5e..e2bdd4b 100644
Binary files a/vllm/model_executor/models/__pycache__/bloom.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/bloom.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/chameleon.cpython-310.pyc b/vllm/model_executor/models/__pycache__/chameleon.cpython-310.pyc
index 8f0dc82..4d355f0 100644
Binary files a/vllm/model_executor/models/__pycache__/chameleon.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/chameleon.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/chatglm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/chatglm.cpython-310.pyc
index dadaa45..5a0e6b0 100644
Binary files a/vllm/model_executor/models/__pycache__/chatglm.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/chatglm.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/clip.cpython-310.pyc b/vllm/model_executor/models/__pycache__/clip.cpython-310.pyc
index 7dccad7..8d371d8 100644
Binary files a/vllm/model_executor/models/__pycache__/clip.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/clip.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/commandr.cpython-310.pyc b/vllm/model_executor/models/__pycache__/commandr.cpython-310.pyc
index cdfcd5d..91e8bc6 100644
Binary files a/vllm/model_executor/models/__pycache__/commandr.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/commandr.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/dbrx.cpython-310.pyc b/vllm/model_executor/models/__pycache__/dbrx.cpython-310.pyc
index ddb1e64..2426736 100644
Binary files a/vllm/model_executor/models/__pycache__/dbrx.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/dbrx.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/decilm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/decilm.cpython-310.pyc
index 9479d4e..30d7d4c 100644
Binary files a/vllm/model_executor/models/__pycache__/decilm.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/decilm.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/deepseek.cpython-310.pyc b/vllm/model_executor/models/__pycache__/deepseek.cpython-310.pyc
index 4089242..4533427 100644
Binary files a/vllm/model_executor/models/__pycache__/deepseek.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/deepseek.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/deepseek_v2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/deepseek_v2.cpython-310.pyc
index cde5da2..bb2b8a7 100644
Binary files a/vllm/model_executor/models/__pycache__/deepseek_v2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/deepseek_v2.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/eagle.cpython-310.pyc b/vllm/model_executor/models/__pycache__/eagle.cpython-310.pyc
index 52cb839..9aedd9d 100644
Binary files a/vllm/model_executor/models/__pycache__/eagle.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/eagle.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/exaone.cpython-310.pyc b/vllm/model_executor/models/__pycache__/exaone.cpython-310.pyc
index 48f287a..3d88bd0 100644
Binary files a/vllm/model_executor/models/__pycache__/exaone.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/exaone.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/falcon.cpython-310.pyc b/vllm/model_executor/models/__pycache__/falcon.cpython-310.pyc
index 90692d5..bba13aa 100644
Binary files a/vllm/model_executor/models/__pycache__/falcon.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/falcon.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/fuyu.cpython-310.pyc b/vllm/model_executor/models/__pycache__/fuyu.cpython-310.pyc
index 81ff33d..03c9d37 100644
Binary files a/vllm/model_executor/models/__pycache__/fuyu.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/fuyu.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/gemma.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gemma.cpython-310.pyc
index e2608e5..31ad8d3 100644
Binary files a/vllm/model_executor/models/__pycache__/gemma.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gemma.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/gemma2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gemma2.cpython-310.pyc
index 6b8f9e3..eba3d7d 100644
Binary files a/vllm/model_executor/models/__pycache__/gemma2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gemma2.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/gemma2_embedding.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gemma2_embedding.cpython-310.pyc
index ce06079..601e6c3 100644
Binary files a/vllm/model_executor/models/__pycache__/gemma2_embedding.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gemma2_embedding.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/glm4.cpython-310.pyc b/vllm/model_executor/models/__pycache__/glm4.cpython-310.pyc
index b6c8e16..4d465ee 100644
Binary files a/vllm/model_executor/models/__pycache__/glm4.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/glm4.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/glm4_vision_encoder.cpython-310.pyc b/vllm/model_executor/models/__pycache__/glm4_vision_encoder.cpython-310.pyc
index 1efd14e..2025bae 100644
Binary files a/vllm/model_executor/models/__pycache__/glm4_vision_encoder.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/glm4_vision_encoder.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/gpt2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gpt2.cpython-310.pyc
index 3cc35e5..39602e6 100644
Binary files a/vllm/model_executor/models/__pycache__/gpt2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gpt2.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/gpt_bigcode.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gpt_bigcode.cpython-310.pyc
index 97c4c4c..6fa561a 100644
Binary files a/vllm/model_executor/models/__pycache__/gpt_bigcode.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gpt_bigcode.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/gpt_j.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gpt_j.cpython-310.pyc
index 2025e1d..1375c93 100644
Binary files a/vllm/model_executor/models/__pycache__/gpt_j.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gpt_j.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/gpt_neox.cpython-310.pyc b/vllm/model_executor/models/__pycache__/gpt_neox.cpython-310.pyc
index f551e48..a0a3d4a 100644
Binary files a/vllm/model_executor/models/__pycache__/gpt_neox.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/gpt_neox.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/granite.cpython-310.pyc b/vllm/model_executor/models/__pycache__/granite.cpython-310.pyc
index 2117e32..c48c518 100644
Binary files a/vllm/model_executor/models/__pycache__/granite.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/granite.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/granitemoe.cpython-310.pyc b/vllm/model_executor/models/__pycache__/granitemoe.cpython-310.pyc
index 915ea81..92d5160 100644
Binary files a/vllm/model_executor/models/__pycache__/granitemoe.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/granitemoe.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/idefics2_vision_model.cpython-310.pyc b/vllm/model_executor/models/__pycache__/idefics2_vision_model.cpython-310.pyc
index c0eb9ce..ddba331 100644
Binary files a/vllm/model_executor/models/__pycache__/idefics2_vision_model.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/idefics2_vision_model.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/interfaces.cpython-310.pyc b/vllm/model_executor/models/__pycache__/interfaces.cpython-310.pyc
index bfc0a43..54a8e4b 100644
Binary files a/vllm/model_executor/models/__pycache__/interfaces.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/interfaces.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/interfaces_base.cpython-310.pyc b/vllm/model_executor/models/__pycache__/interfaces_base.cpython-310.pyc
index bd8289c..30730cf 100644
Binary files a/vllm/model_executor/models/__pycache__/interfaces_base.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/interfaces_base.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/intern_vit.cpython-310.pyc b/vllm/model_executor/models/__pycache__/intern_vit.cpython-310.pyc
index 2c2fe53..f6c30cc 100644
Binary files a/vllm/model_executor/models/__pycache__/intern_vit.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/intern_vit.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/internlm2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/internlm2.cpython-310.pyc
index cdf8002..e2ac992 100644
Binary files a/vllm/model_executor/models/__pycache__/internlm2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/internlm2.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/internvl.cpython-310.pyc b/vllm/model_executor/models/__pycache__/internvl.cpython-310.pyc
index c2207f6..72b1906 100644
Binary files a/vllm/model_executor/models/__pycache__/internvl.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/internvl.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/jais.cpython-310.pyc b/vllm/model_executor/models/__pycache__/jais.cpython-310.pyc
index 68b8894..b727dd6 100644
Binary files a/vllm/model_executor/models/__pycache__/jais.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/jais.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/jamba.cpython-310.pyc b/vllm/model_executor/models/__pycache__/jamba.cpython-310.pyc
index 088a165..753a1d6 100644
Binary files a/vllm/model_executor/models/__pycache__/jamba.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/jamba.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/llama.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llama.cpython-310.pyc
index 09e72b5..c2c4cd6 100644
Binary files a/vllm/model_executor/models/__pycache__/llama.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llama.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/llama_embedding.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llama_embedding.cpython-310.pyc
index be7ed35..6ddb654 100644
Binary files a/vllm/model_executor/models/__pycache__/llama_embedding.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llama_embedding.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/llava.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llava.cpython-310.pyc
index b0e994d..b0a04c7 100644
Binary files a/vllm/model_executor/models/__pycache__/llava.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llava.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/llava_next.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llava_next.cpython-310.pyc
index 491a949..fb98303 100644
Binary files a/vllm/model_executor/models/__pycache__/llava_next.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llava_next.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/llava_next_video.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llava_next_video.cpython-310.pyc
index ba9e031..e747e99 100644
Binary files a/vllm/model_executor/models/__pycache__/llava_next_video.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llava_next_video.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/llava_onevision.cpython-310.pyc b/vllm/model_executor/models/__pycache__/llava_onevision.cpython-310.pyc
index 3a3049e..a27059f 100644
Binary files a/vllm/model_executor/models/__pycache__/llava_onevision.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/llava_onevision.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/mamba.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mamba.cpython-310.pyc
index 32a4981..3cba92a 100644
Binary files a/vllm/model_executor/models/__pycache__/mamba.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mamba.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/mamba_cache.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mamba_cache.cpython-310.pyc
index 9ea4aa7..b677ee7 100644
Binary files a/vllm/model_executor/models/__pycache__/mamba_cache.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mamba_cache.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/medusa.cpython-310.pyc b/vllm/model_executor/models/__pycache__/medusa.cpython-310.pyc
index 1576df2..ab11d57 100644
Binary files a/vllm/model_executor/models/__pycache__/medusa.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/medusa.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/minicpm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/minicpm.cpython-310.pyc
index f028c81..6695051 100644
Binary files a/vllm/model_executor/models/__pycache__/minicpm.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/minicpm.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/minicpm3.cpython-310.pyc b/vllm/model_executor/models/__pycache__/minicpm3.cpython-310.pyc
index 24638a6..9afdf52 100644
Binary files a/vllm/model_executor/models/__pycache__/minicpm3.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/minicpm3.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/minicpmv.cpython-310.pyc b/vllm/model_executor/models/__pycache__/minicpmv.cpython-310.pyc
index 7db5ee3..7bafcf1 100644
Binary files a/vllm/model_executor/models/__pycache__/minicpmv.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/minicpmv.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/mixtral.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mixtral.cpython-310.pyc
index ddfa63f..ee61e0b 100644
Binary files a/vllm/model_executor/models/__pycache__/mixtral.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mixtral.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/mixtral_quant.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mixtral_quant.cpython-310.pyc
index f1fc24f..eaf2ba1 100644
Binary files a/vllm/model_executor/models/__pycache__/mixtral_quant.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mixtral_quant.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/mllama.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mllama.cpython-310.pyc
index f41d430..8738631 100644
Binary files a/vllm/model_executor/models/__pycache__/mllama.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mllama.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/mlp_speculator.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mlp_speculator.cpython-310.pyc
index 527a4fd..249a005 100644
Binary files a/vllm/model_executor/models/__pycache__/mlp_speculator.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mlp_speculator.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/module_mapping.cpython-310.pyc b/vllm/model_executor/models/__pycache__/module_mapping.cpython-310.pyc
index bf4fb66..bb01564 100644
Binary files a/vllm/model_executor/models/__pycache__/module_mapping.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/module_mapping.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/molmo.cpython-310.pyc b/vllm/model_executor/models/__pycache__/molmo.cpython-310.pyc
index 6619caf..e5dbd8b 100644
Binary files a/vllm/model_executor/models/__pycache__/molmo.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/molmo.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/mpt.cpython-310.pyc b/vllm/model_executor/models/__pycache__/mpt.cpython-310.pyc
index 7238d6f..cb952cb 100644
Binary files a/vllm/model_executor/models/__pycache__/mpt.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/mpt.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/nemotron.cpython-310.pyc b/vllm/model_executor/models/__pycache__/nemotron.cpython-310.pyc
index 54a91ea..0d8568c 100644
Binary files a/vllm/model_executor/models/__pycache__/nemotron.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/nemotron.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/nvlm_d.cpython-310.pyc b/vllm/model_executor/models/__pycache__/nvlm_d.cpython-310.pyc
index 829adaf..7d71113 100644
Binary files a/vllm/model_executor/models/__pycache__/nvlm_d.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/nvlm_d.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/olmo.cpython-310.pyc b/vllm/model_executor/models/__pycache__/olmo.cpython-310.pyc
index db57da9..8d0aa1f 100644
Binary files a/vllm/model_executor/models/__pycache__/olmo.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/olmo.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/olmoe.cpython-310.pyc b/vllm/model_executor/models/__pycache__/olmoe.cpython-310.pyc
index d428615..78f03c7 100644
Binary files a/vllm/model_executor/models/__pycache__/olmoe.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/olmoe.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/opt.cpython-310.pyc b/vllm/model_executor/models/__pycache__/opt.cpython-310.pyc
index 1998f13..d13c4cc 100644
Binary files a/vllm/model_executor/models/__pycache__/opt.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/opt.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/orion.cpython-310.pyc b/vllm/model_executor/models/__pycache__/orion.cpython-310.pyc
index 3e1747f..b6cf059 100644
Binary files a/vllm/model_executor/models/__pycache__/orion.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/orion.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/paligemma.cpython-310.pyc b/vllm/model_executor/models/__pycache__/paligemma.cpython-310.pyc
index 06cef71..c87e787 100644
Binary files a/vllm/model_executor/models/__pycache__/paligemma.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/paligemma.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/persimmon.cpython-310.pyc b/vllm/model_executor/models/__pycache__/persimmon.cpython-310.pyc
index 7102bb8..3a1f833 100644
Binary files a/vllm/model_executor/models/__pycache__/persimmon.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/persimmon.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/phi.cpython-310.pyc b/vllm/model_executor/models/__pycache__/phi.cpython-310.pyc
index e081a8c..150fbf8 100644
Binary files a/vllm/model_executor/models/__pycache__/phi.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/phi.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/phi3.cpython-310.pyc b/vllm/model_executor/models/__pycache__/phi3.cpython-310.pyc
index 55c4858..ee65fa5 100644
Binary files a/vllm/model_executor/models/__pycache__/phi3.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/phi3.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/phi3_small.cpython-310.pyc b/vllm/model_executor/models/__pycache__/phi3_small.cpython-310.pyc
index c1d83d4..4e9d52a 100644
Binary files a/vllm/model_executor/models/__pycache__/phi3_small.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/phi3_small.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/phi3v.cpython-310.pyc b/vllm/model_executor/models/__pycache__/phi3v.cpython-310.pyc
index daff7ba..359db8a 100644
Binary files a/vllm/model_executor/models/__pycache__/phi3v.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/phi3v.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/phimoe.cpython-310.pyc b/vllm/model_executor/models/__pycache__/phimoe.cpython-310.pyc
index 9c26fe1..77f5a83 100644
Binary files a/vllm/model_executor/models/__pycache__/phimoe.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/phimoe.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/pixtral.cpython-310.pyc b/vllm/model_executor/models/__pycache__/pixtral.cpython-310.pyc
index 2c1458f..7b4d3c7 100644
Binary files a/vllm/model_executor/models/__pycache__/pixtral.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/pixtral.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/qwen.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen.cpython-310.pyc
index d032ca4..339947a 100644
Binary files a/vllm/model_executor/models/__pycache__/qwen.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/qwen2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen2.cpython-310.pyc
index 06ac033..6c49490 100644
Binary files a/vllm/model_executor/models/__pycache__/qwen2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen2.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/qwen2_5_vl.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen2_5_vl.cpython-310.pyc
new file mode 100644
index 0000000..e96226c
Binary files /dev/null and b/vllm/model_executor/models/__pycache__/qwen2_5_vl.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/qwen2_moe.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen2_moe.cpython-310.pyc
index 6c33bbe..b62b317 100644
Binary files a/vllm/model_executor/models/__pycache__/qwen2_moe.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen2_moe.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/qwen2_rm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen2_rm.cpython-310.pyc
index 8b2d8b0..01486a9 100644
Binary files a/vllm/model_executor/models/__pycache__/qwen2_rm.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen2_rm.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/qwen2_vl.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen2_vl.cpython-310.pyc
index de7b1b9..21a2a47 100644
Binary files a/vllm/model_executor/models/__pycache__/qwen2_vl.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen2_vl.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/qwen3.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen3.cpython-310.pyc
index 11f7142..58765bc 100644
Binary files a/vllm/model_executor/models/__pycache__/qwen3.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen3.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/qwen3_moe.cpython-310.pyc b/vllm/model_executor/models/__pycache__/qwen3_moe.cpython-310.pyc
index 6aed6f1..61a305e 100644
Binary files a/vllm/model_executor/models/__pycache__/qwen3_moe.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/qwen3_moe.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/registry.cpython-310.pyc b/vllm/model_executor/models/__pycache__/registry.cpython-310.pyc
index a3999c8..576a538 100644
Binary files a/vllm/model_executor/models/__pycache__/registry.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/registry.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/siglip.cpython-310.pyc b/vllm/model_executor/models/__pycache__/siglip.cpython-310.pyc
index 186536a..b3c6fe0 100644
Binary files a/vllm/model_executor/models/__pycache__/siglip.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/siglip.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/solar.cpython-310.pyc b/vllm/model_executor/models/__pycache__/solar.cpython-310.pyc
index bde0046..150fd0b 100644
Binary files a/vllm/model_executor/models/__pycache__/solar.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/solar.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/stablelm.cpython-310.pyc b/vllm/model_executor/models/__pycache__/stablelm.cpython-310.pyc
index ba2924f..c4d0eb0 100644
Binary files a/vllm/model_executor/models/__pycache__/stablelm.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/stablelm.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/starcoder2.cpython-310.pyc b/vllm/model_executor/models/__pycache__/starcoder2.cpython-310.pyc
index d16115e..f9accf5 100644
Binary files a/vllm/model_executor/models/__pycache__/starcoder2.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/starcoder2.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/ultravox.cpython-310.pyc b/vllm/model_executor/models/__pycache__/ultravox.cpython-310.pyc
index 4c598e6..199e76a 100644
Binary files a/vllm/model_executor/models/__pycache__/ultravox.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/ultravox.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/utils.cpython-310.pyc b/vllm/model_executor/models/__pycache__/utils.cpython-310.pyc
index b94a9ba..1f22f8d 100644
Binary files a/vllm/model_executor/models/__pycache__/utils.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/__pycache__/xverse.cpython-310.pyc b/vllm/model_executor/models/__pycache__/xverse.cpython-310.pyc
index 71a46f3..e621a3c 100644
Binary files a/vllm/model_executor/models/__pycache__/xverse.cpython-310.pyc and b/vllm/model_executor/models/__pycache__/xverse.cpython-310.pyc differ
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 9024831..203badb 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -4,7 +4,7 @@
 # Copyright (c) 2023 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
-import re
+import regex as re
 from functools import cached_property, partial
 from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
                     TypedDict, Union)
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 9ee4dd0..0236fc4 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -22,7 +22,7 @@
 # limitations under the License.
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
 import math
-import re
+import regex as re
 from functools import partial
 from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional,
                     Tuple, TypedDict, Union)
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index ccfee16..2ff4e04 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1,6 +1,6 @@
 import logging
 import math
-import re
+import regex as re
 from array import array
 from dataclasses import dataclass
 from functools import lru_cache, partial
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 00a04da..523375f 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import itertools
-import re
+import regex as re
 from functools import cached_property, lru_cache
 from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional,
                     Tuple, TypedDict, Union)
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index fd8a27e..3c94173 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -6,7 +6,7 @@
 """Inference-only QWen model compatible with HuggingFace weights."""
 
 import math
-import re
+import regex as re
 from functools import partial
 from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
                     Optional, Tuple, TypedDict, Union)
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
new file mode 100644
index 0000000..bae5e54
--- /dev/null
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -0,0 +1,1213 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2.5-VL model compatible with HuggingFace weights."""
+from functools import lru_cache, partial
+from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional,
+                    Tuple, Type, TypedDict, Union)
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from PIL import Image
+from transformers.image_utils import (get_image_size,
+                                      infer_channel_dimension_format,
+                                      to_numpy_array)
+from transformers.models.qwen2_vl.image_processing_qwen2_vl import (make_batched_videos, smart_resize)
+from transformers.models.emu3.image_processing_emu3 import make_batched_images
+
+import vllm.envs as envs
+from vllm.attention import AttentionMetadata
+from vllm.attention.selector import (_Backend, backend_name_to_enum,
+                                     get_global_forced_attn_backend)
+from vllm.config import CacheConfig, MultiModalConfig
+from vllm.distributed import get_pp_group, parallel_state, tensor_model_parallel_all_gather
+from vllm.distributed import utils as dist_utils
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
+from vllm.logger import init_logger
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.qwen2 import Qwen2Model
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
+                             MultiModalInputs)
+from vllm.multimodal.base import MultiModalData
+from vllm.multimodal.image import cached_get_image_processor
+from vllm.platforms import current_platform
+from vllm.sequence import IntermediateTensors, SequenceData
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
+    Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig)
+from vllm.transformers_utils.processor import get_processor
+from vllm.utils import is_cpu
+
+from .interfaces import SupportsMultiModal, SupportsPP
+from .utils import (PPMissingLayer, is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory)
+
+logger = init_logger(__name__)  # module-level logger for this model file
+
+# === Vision Inputs === #
+
+
+class Qwen2_5_VLImagePixelInputs(TypedDict):
+    """Image input given as flattened pixel patches plus per-image grids."""
+
+    # Tag identifying this variant of `Qwen2_5_VLImageInputs`.
+    type: Literal["pixel_values"]
+
+    # Shape: `(num_patches, num_channels * patch_size * patch_size)`
+    data: torch.Tensor
+
+    # Shape: `(num_images, 3)`, laid out as `(grid_t, grid_h, grid_w)`.
+    image_grid_thw: torch.Tensor
+
+
+class Qwen2_5_VLImageEmbeddingInputs(TypedDict):
+    """Image input given as already-computed embeddings."""
+    type: Literal["image_embeds"]
+    # Shape: `(batch_size * num_images, image_feature_size, hidden_size)`;
+    # `hidden_size` must match the hidden size of language model backbone.
+    data: torch.Tensor
+
+
+Qwen2_5_VLImageInputs = Union[Qwen2_5_VLImagePixelInputs,  # pixels or embeds
+                           Qwen2_5_VLImageEmbeddingInputs]
+
+
+class Qwen2_5_VLVideoInputs(TypedDict):
+    """Video input given as flattened pixel patches plus per-video grids."""
+
+    # Shape:
+    # `(num_patches,
+    #   num_channels * temporal_patch_size * patch_size * patch_size)`
+    pixel_values_videos: torch.Tensor
+
+    # Shape: `(num_videos, 3)`
+    #
+    # This should be in `(grid_t, grid_h, grid_w)` format.
+    video_grid_thw: torch.Tensor
+
+
+# === Vision Encoder === #
+
+
+class Qwen2_5_VisionMLP(nn.Module):
+    """Gated MLP: `down_proj(act(gate_proj(x)) * up_proj(x))`."""
+
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int = None,
+        bias: bool = False,
+        act_layer: Callable[[torch.Tensor], torch.Tensor] = F.silu,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        # The gate and up projections share the same configuration.
+        make_in_proj = partial(ColumnParallelLinear,
+                               in_features,
+                               hidden_features,
+                               bias=bias,
+                               quant_config=quant_config)
+        self.gate_proj = make_in_proj()
+        self.up_proj = make_in_proj()
+        self.down_proj = RowParallelLinear(hidden_features,
+                                           in_features,
+                                           bias=bias,
+                                           quant_config=quant_config)
+        self.act = act_layer
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Each parallel linear returns a tuple; only its first item is used.
+        gated = self.act(self.gate_proj(x)[0])
+        projected_up = self.up_proj(x)[0]
+        return self.down_proj(gated * projected_up)[0]
+
+
+def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
+    """Map feature pairs `(a, b)` of the last dim to `(-b, a)`."""
+    if interleaved:
+        # Pairs are adjacent elements: (x[0], x[1]), (x[2], x[3]), ...
+        evens, odds = x[..., ::2], x[..., 1::2]
+        stacked = torch.stack((-odds, evens), dim=-1)
+        return rearrange(stacked, "... d two -> ... (d two)", two=2)
+    lower, upper = x.chunk(2, dim=-1)  # pairs are (first half, second half)
+    return torch.cat((-upper, lower), dim=-1)
+
+
+def apply_rotary_emb_torch(x: torch.Tensor,
+                           cos: torch.Tensor,
+                           sin: torch.Tensor,
+                           interleaved: bool = False) -> torch.Tensor:
+    """Rotate the first `rotary_dim` features of `x`; keep the rest as is.
+
+    x: (batch_size, seqlen, nheads, headdim)
+    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+    """
+    rotary_dim = 2 * cos.shape[-1]
+    assert rotary_dim <= x.shape[-1]
+    # Insert a broadcastable head axis and duplicate every angle so that it
+    # covers both members of its feature pair.
+    if interleaved:
+        pattern = "... d -> ... 1 (d 2)"
+    else:
+        pattern = "... d -> ... 1 (2 d)"
+    cos_full = repeat(cos, pattern)
+    sin_full = repeat(sin, pattern)
+    # Only the leading `rotary_dim` features are rotated.
+    x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
+    x_rot = x_rot * cos_full + rotate_half(x_rot, interleaved) * sin_full
+    return torch.cat([x_rot, x_pass], dim=-1)
+
+
+def apply_rotary_pos_emb_vision(t: torch.Tensor,
+                                freqs: torch.Tensor) -> torch.Tensor:
+    """Rotate `t` by the angles in `freqs`, doing the math in float32."""
+    # `type_as` converts the float32 result back to the type of `t`.
+    t_fp32 = t.float()
+    rotated = apply_rotary_emb_torch(t_fp32, freqs.cos(), freqs.sin())
+    return rotated.type_as(t)
+
+
+class Qwen2_5_VisionAttention(nn.Module):
+    """Vision self-attention: fused QKV, per-sequence attention, projection."""
+
+    def __init__(
+        self,
+        embed_dim: Optional[int] = None,
+        num_heads: Optional[int] = None,
+        projection_size: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        # Per attention head and per partition values.
+        self.tp_size = parallel_state.get_tensor_model_parallel_world_size()
+        # Read by `split_qkv` to select this rank's shard when tp_size > 1.
+        self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            projection_size, num_heads)
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            num_heads, self.tp_size)
+
+        self.qkv = ColumnParallelLinear(input_size=embed_dim,
+                                        output_size=3 * projection_size,
+                                        quant_config=quant_config)
+        self.proj = RowParallelLinear(input_size=projection_size,
+                                      output_size=embed_dim,
+                                      quant_config=quant_config)
+
+        # Detect attention implementation (`forward` does not read the flag).
+        selected_backend: Optional[_Backend] = get_global_forced_attn_backend()
+        if selected_backend is None:
+            backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+            if backend_by_env_var is not None:
+                selected_backend = backend_name_to_enum(backend_by_env_var)
+        if selected_backend is None:
+            # For Volta and Turing GPUs, use xformers instead.
+            self._use_flash_attn = False
+            if current_platform.has_device_capability(80):
+                from transformers.utils import is_flash_attn_2_available
+
+                if is_flash_attn_2_available():
+                    self._use_flash_attn = True
+                else:
+                    logger.warning(
+                        "Current Qwen2-VL implementation has a bug with "
+                        "`vllm-flash-attn` inside vision module, so we use "
+                        "xformers backend instead. You can run `pip install "
+                        "flash-attn to use flash-attention backend.")
+        elif selected_backend == _Backend.FLASH_ATTN:
+            self._use_flash_attn = True
+        elif selected_backend == _Backend.XFORMERS:
+            self._use_flash_attn = False
+        else:
+            raise RuntimeError(
+                f"Qwen2-5-VL does not support {selected_backend} backend now."
+            )
+
+    def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
+        """Split fused QKV into this rank's q, k, v as [s, b, head, head_dim]."""
+        # [s, b, 3 * head * head_dim]
+        seq_len, bs, _ = qkv.shape
+        if self.tp_size > 1:
+            qkv = tensor_model_parallel_all_gather(qkv)
+
+        # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim]
+        q, k, v = qkv.chunk(3, dim=2)
+
+        # 3 * [s, b, head * head_dim]
+        if self.tp_size > 1:
+            splitter = partial(dist_utils.split_tensor_along_last_dim,
+                               num_partitions=self.tp_size)
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+            v = splitter(v)[self.tp_rank]
+
+        # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim]
+        new_shape = (seq_len, bs, self.num_attention_heads_per_partition,
+                     self.hidden_size_per_attention_head)
+        q, k, v = (x.view(*new_shape) for x in (q, k, v))
+        return q, k, v
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """Attend within each `cu_seqlens` segment and project the result."""
+        # [s, b, c] --> [s, b, head * 3 * head_dim]
+        x, _ = self.qkv(x)
+
+        # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim]
+        q, k, v = self.split_qkv(x)
+
+        q, k, v = [
+            rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
+        ]
+        if rotary_pos_emb is not None:
+            q = apply_rotary_pos_emb_vision(q, rotary_pos_emb)
+            k = apply_rotary_pos_emb_vision(k, rotary_pos_emb)
+
+        from ixformer.contrib.xformers import ops as xops
+        from xformers.ops.fmha.attn_bias import BlockDiagonalMask
+
+        seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
+                                                    kv_seqlen=None)
+        context_layer = xops.memory_efficient_attention_forward(
+            q, k, v, attn_bias=attn_bias, p=0, scale=None, op=xops.fmha.flash.FwOp())
+        context_layer = rearrange(context_layer,
+                                  "b s h d -> s b (h d)").contiguous()
+
+        output, _ = self.proj(context_layer)
+        return output
+
+
+class Qwen2_5_VisionBlock(nn.Module):
+    """Pre-norm block: attention, then MLP, each added back as a residual."""
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_hidden_dim: int,
+        act_layer: Callable[[torch.Tensor], torch.Tensor] = F.silu,
+        norm_layer: Type[nn.Module] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        # Default to LayerNorm when the caller supplies no norm factory.
+        make_norm = (partial(nn.LayerNorm, eps=1e-6)
+                     if norm_layer is None else norm_layer)
+        self.norm1 = make_norm(dim)
+        self.norm2 = make_norm(dim)
+
+        self.attn = Qwen2_5_VisionAttention(embed_dim=dim,
+                                            num_heads=num_heads,
+                                            projection_size=dim,
+                                            quant_config=quant_config)
+        self.mlp = Qwen2_5_VisionMLP(in_features=dim,
+                                     hidden_features=mlp_hidden_dim,
+                                     bias=True,
+                                     act_layer=act_layer,
+                                     quant_config=quant_config)
+
+    def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor,
+                rotary_pos_emb: torch.Tensor) -> torch.Tensor:
+        normed = self.norm1(x)
+        x = x + self.attn(normed, cu_seqlens=cu_seqlens,
+                          rotary_pos_emb=rotary_pos_emb)
+        return x + self.mlp(self.norm2(x))
+
+
+class Qwen2_5_VisionPatchEmbed(nn.Module):
+    """Embed flattened patches with a non-overlapping 3D convolution."""
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        in_chans: int = 3,
+        hidden_size: int = 1152,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.hidden_size = hidden_size
+
+        # Stride equals kernel size, so every patch is convolved exactly once.
+        patch_dims = [temporal_patch_size, patch_size, patch_size]
+        self.proj = nn.Conv3d(in_chans, hidden_size, kernel_size=patch_dims,
+                              stride=patch_dims, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # `x` must be 2-D with one row per patch; each row is viewed as
+        # (channels, temporal_patch_size, patch_size, patch_size).
+        num_patches, _ = x.shape
+        patches = x.view(num_patches, -1, self.temporal_patch_size,
+                         self.patch_size, self.patch_size)
+        return self.proj(patches).view(num_patches, self.hidden_size)
+
+
+class Qwen2_5_VisionPatchMerger(nn.Module):
+    """Fuse `spatial_merge_size**2` patch features and project to `d_model`."""
+
+    def __init__(
+        self,
+        d_model: int,
+        context_dim: int,
+        norm_layer: Type[nn.Module] = None,
+        spatial_merge_size: int = 2,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        # Width of one merged group of patch features.
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        make_norm = (partial(nn.LayerNorm, eps=1e-6)
+                     if norm_layer is None else norm_layer)
+        self.ln_q = make_norm(context_dim)
+        fc1 = ColumnParallelLinear(self.hidden_size,
+                                   self.hidden_size,
+                                   bias=True,
+                                   quant_config=quant_config)
+        fc2 = RowParallelLinear(self.hidden_size,
+                                d_model,
+                                bias=True,
+                                quant_config=quant_config)
+        self.mlp = nn.ModuleList([fc1, nn.GELU(), fc2])
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Normalize each patch, then fold every group of patches into one row.
+        merged = self.ln_q(x).view(-1, self.hidden_size)
+
+        # The parallel linear layers return tuples; only item 0 is used.
+        fc1, act, fc2 = self.mlp
+        hidden, _ = fc1(merged)
+        out, _ = fc2(act(hidden))
+        return out
+
+
+class Qwen2_5_VisionRotaryEmbedding(nn.Module):
+    """Lazily grown table of rotary angles `position * inv_freq`."""
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (theta
+                          **(torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self._seq_len_cached = 0
+        self._freqs_cached = None
+
+    def update_freqs_cache(self, seqlen: int) -> None:
+        if seqlen > self._seq_len_cached:
+            # Out of place: `seqlen *= 2` would double a caller's tensor.
+            self._seq_len_cached = new_len = seqlen * 2
+            self.inv_freq = 1.0 / (self.theta**(torch.arange(
+                0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device)
+                                                / self.dim))
+            seq = torch.arange(new_len, device=self.inv_freq.device,
+                               dtype=self.inv_freq.dtype)
+            self._freqs_cached = torch.outer(seq, self.inv_freq)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        """Return the angle rows for positions `0 .. seqlen - 1`."""
+        self.update_freqs_cache(seqlen)
+        return self._freqs_cached[:seqlen]
+
+
+class Qwen2_5_VisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        vision_config: Qwen2_5_VLVisionConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        """Build the patch embed, rotary table, blocks and merger."""
+        super().__init__()
+
+        hidden_size: int = vision_config.hidden_size
+        num_heads: int = vision_config.num_heads
+        # Settings kept on the instance for use outside `__init__`.
+        self.spatial_merge_size = vision_config.spatial_merge_size
+        self.window_size = vision_config.window_size
+        self.patch_size = vision_config.patch_size
+        self.spatial_merge_unit = self.spatial_merge_size**2
+        self.fullatt_block_indexes = vision_config.fullatt_block_indexes
+
+        self.patch_embed = Qwen2_5_VisionPatchEmbed(
+            patch_size=self.patch_size,
+            temporal_patch_size=vision_config.temporal_patch_size,
+            in_chans=vision_config.in_channels,
+            hidden_size=hidden_size,
+        )
+        # The rotary table is built for half of the per-head dimension.
+        self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(
+            hidden_size // num_heads // 2)
+
+        norm_layer = partial(RMSNorm, eps=norm_eps)
+        blocks = []
+        for _ in range(vision_config.depth):
+            blocks.append(
+                Qwen2_5_VisionBlock(
+                    dim=hidden_size,
+                    num_heads=num_heads,
+                    mlp_hidden_dim=vision_config.intermediate_size,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                ))
+        self.blocks = nn.ModuleList(blocks)
+        self.merger = Qwen2_5_VisionPatchMerger(
+            d_model=vision_config.out_hidden_size,
+            context_dim=hidden_size,
+            norm_layer=norm_layer,
+            spatial_merge_size=self.spatial_merge_size,
+            quant_config=quant_config,
+        )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype  # follows the patch conv weight
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device  # follows the patch conv weight
+
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        """Look up rotary angles for the (h, w) position of every patch."""
+        merge = self.spatial_merge_size
+
+        def in_merge_order(ids: torch.Tensor, h, w) -> torch.Tensor:
+            # Reorder so each merge x merge tile of the grid is contiguous.
+            tiled = ids.reshape(h // merge, merge, w // merge, merge)
+            return tiled.permute(0, 2, 1, 3).flatten()
+
+        per_item = []
+        for t, h, w in grid_thw:
+            rows = torch.arange(h).unsqueeze(1).expand(-1, w)
+            cols = torch.arange(w).unsqueeze(0).expand(h, -1)
+            hw_ids = torch.stack(
+                [in_merge_order(rows, h, w),
+                 in_merge_order(cols, h, w)], dim=-1)
+            # The same spatial ids are repeated for each of the t steps.
+            per_item.append(hw_ids.repeat(t, 1))
+        pos_ids = torch.cat(per_item, dim=0)
+
+        # One table long enough for the largest height/width in the batch.
+        table = self.rotary_pos_emb(grid_thw[:, 1:].max())
+        # Index by (h, w) ids, then join both lookups along the last dim.
+        return table[pos_ids].flatten(1)
+    
+    def get_window_index(self, grid_thw):
+        """Return `(window_index, cu_window_seqlens)`: a window-major order of
+        merged patches, and cumulative window lengths in un-merged patches."""
+
+        win = self.window_size // self.spatial_merge_size // self.patch_size
+        window_index: list = []
+        cu_window_seqlens: list = [0]
+        offset = 0
+
+        for grid_t, grid_h, grid_w in grid_thw:
+            merged_h = grid_h // self.spatial_merge_size
+            merged_w = grid_w // self.spatial_merge_size
+            count = grid_t * merged_h * merged_w
+            index = torch.arange(count).reshape(grid_t, merged_h, merged_w)
+            # Pad to a whole number of windows; -100 marks padded cells.
+            pad_h = win - merged_h % win
+            pad_w = win - merged_w % win
+            windows_h = (merged_h + pad_h) // win
+            windows_w = (merged_w + pad_w) // win
+            padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100)
+            # [t, H, W] -> [t, num_windows, win, win]
+            padded = padded.reshape(grid_t, windows_h, win, windows_w, win)
+            padded = padded.permute(0, 1, 3, 2, 4).reshape(
+                grid_t, windows_h * windows_w, win, win)
+            is_real = padded != -100
+            seqlens = is_real.sum([2, 3]).reshape(-1)
+            window_index.append(padded[is_real] + offset)
+            # Window lengths count merged cells; scale to un-merged patches.
+            cu_tail = (seqlens.cumsum(0) * self.spatial_merge_unit +
+                       cu_window_seqlens[-1])
+            cu_window_seqlens.extend(cu_tail.tolist())
+            offset += count.item()
+        window_index = torch.cat(window_index, dim=0)
+        return window_index, cu_window_seqlens
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        # patchify
+        hidden_states = x.to(device=self.device, dtype=self.dtype)
+        hidden_states = self.patch_embed(hidden_states)
+
+        # compute position embedding
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+
+        # windows attention
+        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
+        cu_window_seqlens = torch.tensor(
+            cu_window_seqlens,
+            device=hidden_states.device,
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32)
+        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)  # drop repeated boundaries (zero-length windows)
+        seq_len, _ = hidden_states.size()  # two-value unpack: patch_embed output must be 2-D here
+        hidden_states = hidden_states.reshape(
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        hidden_states = hidden_states[window_index, :, :]  # reorder groups of spatial_merge_unit rows into window order
+        hidden_states = hidden_states.reshape(seq_len, -1)
+        rotary_pos_emb = rotary_pos_emb.reshape(
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        rotary_pos_emb = rotary_pos_emb[window_index, :, :]  # same reordering, so embeddings stay aligned with rows
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+        # compute cu_seqlens
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],  # h * w per row, repeated t times
+                                             grid_thw[:, 0]).cumsum(
+                                                 dim=0, dtype=torch.int32)
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)  # prepend the leading 0 boundary
+
+        # transformers
+        hidden_states = hidden_states.unsqueeze(1)
+        for layer_num, blk in enumerate(self.blocks):
+            if layer_num in self.fullatt_block_indexes:
+                cu_seqlens_now = cu_seqlens  # listed blocks use the per-frame boundaries
+            else:
+                cu_seqlens_now = cu_window_seqlens  # all other blocks use the per-window boundaries
+            hidden_states = blk(hidden_states,
+                                cu_seqlens=cu_seqlens_now,
+                                rotary_pos_emb=rotary_pos_emb)
+
+        # adapter
+        hidden_states = self.merger(hidden_states)
+        reverse_indices = torch.argsort(window_index)  # intended to undo the window reordering applied above
+        hidden_states = hidden_states[reverse_indices, :]
+        return hidden_states
+
+
+# === Vision input helpers === #
+
+cached_get_processor = lru_cache(get_processor)  # memoised get_processor (default lru_cache settings)
+
+
+def mm_input_mapper_for_qwen2_5_vl(
+    ctx: InputContext,
+    data: MultiModalData[object],
+    data_type_key: str,
+) -> MultiModalInputs:
+    """Input mapper for Qwen2.5-VL; `data_type_key` is "image" or "video"."""
+    if data_type_key == "image" and isinstance(data, dict):
+        return MultiModalInputs({  # dict input: forwarded as-is, no preprocessing
+            "image_embeds": data.get("image_embeds"),
+            "image_grid_thw": data.get("image_grid_thw"),
+        })
+    model_config = ctx.model_config
+    image_processor = cached_get_image_processor(
+        model_config.model, trust_remote_code=model_config.trust_remote_code)
+    if image_processor is None:
+        raise RuntimeError("No HuggingFace processor is available "
+                           f"to process the {data_type_key} object")
+
+    images = None
+    videos = None
+    if data_type_key == "image":
+        images = data
+    else:
+        assert data_type_key == "video"
+        videos = data
+
+    try:
+        batch_data = image_processor \
+            .preprocess(images=images, videos=videos, return_tensors="pt") \
+            .data
+    except Exception:
+        logger.error("Failed to process %s (%s)", data_type_key, data)  # name the actual modality
+        raise
+
+    return MultiModalInputs(batch_data)
+
+
+image_input_mapper_for_qwen2_5_vl = partial(mm_input_mapper_for_qwen2_5_vl,
+                                          data_type_key="image")  # mapper pre-bound to image data
+video_input_mapper_for_qwen2_5_vl = partial(mm_input_mapper_for_qwen2_5_vl,
+                                          data_type_key="video")  # mapper pre-bound to video data
+
+
+def _get_vision_info(
+    image_processor,
+    height: int,
+    width: int,
+    min_pixels: int,
+    max_pixels: int,
+    do_resize: bool = True,
+    data_type_key: str = "image",
+    mm_count: int = 1,
+):
+    """Get information (resized height / width and number of vision tokens)
+    of input image / video frame."""
+
+    if do_resize:
+        resized_height, resized_width = smart_resize(
+            height=height,
+            width=width,
+            factor=image_processor.patch_size * image_processor.merge_size,  # presumably the size granularity; confirm in smart_resize
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+    else:
+        resized_height, resized_width = height, width  # no resize: use the input size unchanged
+
+    if data_type_key == "image":
+        grid_t = mm_count  # images: temporal extent is the item count
+    else:
+        assert data_type_key == "video"
+        grid_t = max(mm_count // image_processor.temporal_patch_size, 1)  # videos: count // temporal_patch_size, at least 1
+
+    grid_h = resized_height // image_processor.patch_size
+    grid_w = resized_width // image_processor.patch_size
+    vision_tokens = grid_t * grid_h * grid_w
+    llm_num_vision_tokens = (vision_tokens // image_processor.merge_size //  # floor-divided by merge_size twice
+                             image_processor.merge_size)
+
+    return resized_height, resized_width, llm_num_vision_tokens
+
+
+def _get_max_image_info(
+    image_processor,
+    data_type_key: str = "image",
+    mm_count: int = 1,
+):
+    return _get_vision_info(  # same 3-tuple as _get_vision_info, computed for an oversized input
+        image_processor,
+        height=9999999,  # deliberately huge; presumably capped via max_pixels inside smart_resize (confirm)
+        width=9999999,
+
+        # Limit min / max pixels.
+        min_pixels=max(image_processor.min_pixels, 28 * 28),  # never below 28 * 28
+        max_pixels=min(image_processor.max_pixels, 1280 * 28 * 28),  # never above 1280 * 28 * 28
+        data_type_key=data_type_key,
+        mm_count=mm_count,
+    )
+
+
+def get_max_qwen2_5_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int:
+    image_processor = cached_get_image_processor(ctx.model_config.model)
+    # Only the token count is returned; the resized height / width are dropped.
+    _, _, max_llm_mm_tokens = _get_max_image_info(
+        image_processor, data_type_key=data_type_key, mm_count=1)
+    return max_llm_mm_tokens
+
+
+get_max_qwen2_5_vl_image_tokens = partial(get_max_qwen2_5_vl_mm_tokens,
+                                        data_type_key="image")  # max-token helper pre-bound to images
+get_max_qwen2_5_vl_video_tokens = partial(get_max_qwen2_5_vl_mm_tokens,
+                                        data_type_key="video")  # max-token helper pre-bound to videos
+
+
+def dummy_data_for_qwen2_5_vl(
+    ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]
+) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
+    """Build dummy sequence data plus max-size dummy image(s) for `seq_len`."""
+    image_processor = cached_get_image_processor(ctx.model_config.model)
+
+    num_images = mm_counts["image"]
+    max_resized_height, max_resized_width, max_llm_image_tokens = \
+        _get_max_image_info(image_processor, data_type_key="image",
+                            mm_count=num_images)
+    if seq_len - max_llm_image_tokens - 2 < 0:  # 2 = vision start + end tokens
+        raise RuntimeError(
+            f"Qwen2.5-VL cannot process {num_images} images in a prompt, "
+            "please increase max_model_len or reduce image limit by "
+            "--limit-mm-per-prompt.")
+
+    # Check video counts; keep the image resize dims for the dummy image.
+    num_videos = mm_counts["video"]
+    _, _, max_llm_video_tokens = _get_max_image_info(
+        image_processor, data_type_key="video", mm_count=num_videos)
+    if seq_len - max_llm_video_tokens - 2 < 0:
+        raise RuntimeError(
+            f"Qwen2.5-VL cannot process {num_videos} videos in a prompt, "
+            "please increase max_model_len or reduce video limit by "
+            "--limit-mm-per-prompt.")
+
+    hf_config = ctx.get_hf_config(Qwen2_5_VLConfig)
+
+    dummy_seqdata = SequenceData.from_token_counts(
+        (hf_config.vision_start_token_id, 1),
+        (hf_config.image_token_id, max_llm_image_tokens),
+        (hf_config.vision_end_token_id, 1),
+        (0, seq_len - max_llm_image_tokens - 2),  # fill the rest with token id 0
+    )
+
+    dummy_image = Image.new("RGB", (max_resized_width, max_resized_height),
+                            color=0)
+
+    return dummy_seqdata, {
+        "image": dummy_image if num_images == 1 else [dummy_image] * num_images
+    }
+
+
+def _get_llm_num_vision_tokens(
+    mm_inputs: list,
+    data_type_key: str,
+    image_processor,
+):
+    """Get number of vision tokens of multimodal inputs.
+
+    This method is derived from `transformers.models.qwen2_vl.
+    image_processing_qwen2_vl.Qwen2VLImageProcessor._preprocess`.
+    """
+    image = to_numpy_array(mm_inputs[0])  # height / width are read from the first item only
+    input_data_format = infer_channel_dimension_format(image)
+    height, width = get_image_size(image, channel_dim=input_data_format)
+    _, _, llm_num_vision_tokens = _get_vision_info(  # resized height / width are not needed here
+        image_processor,
+        height=height,
+        width=width,
+        min_pixels=image_processor.min_pixels,
+        max_pixels=image_processor.max_pixels,
+        do_resize=image_processor.do_resize,
+        data_type_key=data_type_key,
+        mm_count=len(mm_inputs),  # number of items in mm_inputs
+    )
+    return llm_num_vision_tokens
+
+
+def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable,
+                       data_type_key: str, image_processor: Any,
+                       prompt_token_ids: List[int]) -> List[int]:
+    """
+    Expand pad tokens for multi-modal inputs (e.g., images or videos).
+
+    Args:
+        inputs (list): The multi-modal inputs (e.g., images or videos).
+        token_id (int): The token ID used to represent the multi-modal input.
+        make_batched_fn (Callable): A function to batch the inputs.
+        data_type_key (str): The type of the multi-modal input.
+        image_processor (Any): The image processor used to process the inputs.
+        prompt_token_ids (List[int]): The list of token IDs in the prompt.
+
+    Returns:
+        List[int]: The prompt token IDs with every placeholder expanded.
+    """
+    indices = [
+        idx for idx, token in enumerate(prompt_token_ids) if token == token_id
+    ]
+    inputs = make_batched_fn(inputs)
+    assert len(indices) == len(inputs)
+
+    prompt_token_ids_with_data = []
+    # Position in the prompt of the first token not yet copied to the output.
+    next_start = 0
+    for placeholder_idx, data in zip(indices, inputs):
+        num_tokens = _get_llm_num_vision_tokens(
+            [data] if data_type_key == "image" else data,
+            data_type_key=data_type_key,
+            image_processor=image_processor,
+        )
+        # Copy the text before this placeholder, then the expanded pad tokens.
+        prompt_token_ids_with_data.extend(
+            prompt_token_ids[next_start:placeholder_idx])
+        prompt_token_ids_with_data.extend([token_id] * num_tokens)
+        next_start = placeholder_idx + 1
+    # Tail after the last placeholder (the whole prompt when there was none).
+    prompt_token_ids_with_data.extend(prompt_token_ids[next_start:])
+    return prompt_token_ids_with_data
+
+
+def input_processor_for_qwen2_5_vl(ctx: InputContext,
+                                 llm_inputs: LLMInputs) -> LLMInputs:
+    """Expand image / video placeholder tokens to their vision token counts."""
+    multi_modal_data = llm_inputs.get("multi_modal_data", None)
+    if multi_modal_data is None:
+        return llm_inputs
+
+    image_inputs = multi_modal_data.get("image", None)
+    video_inputs = multi_modal_data.get("video", None)
+
+    processor = cached_get_processor(ctx.model_config.model)
+    image_processor = processor.image_processor
+    hf_config = ctx.get_hf_config(Qwen2_5_VLConfig)
+
+    # To avoid redundant processing of vision objects (resize, rescale, etc.),
+    # we extract code of calculating number of vision tokens from
+    # `transformers.models.qwen2_5_vl.processing_qwen2_5_vl.Qwen2_5_VLProcessor`.
+    #
+    # The following code is equivalent to:
+    #    prompt = llm_inputs["prompt"]
+    #    inputs = processor(text=[prompt],
+    #                       images=image_inputs,
+    #                       videos=video_inputs,
+    #                       padding=True,
+    #                       return_tensors="pt")
+    #    prompt_token_ids = inputs["input_ids"][0].tolist()
+
+    prompt_token_ids = llm_inputs.get("prompt_token_ids", None)
+    if prompt_token_ids is None:
+        prompt = llm_inputs["prompt"]
+        prompt_token_ids = processor.tokenizer(
+            prompt,
+            padding=True,
+            return_tensors=None,
+        )["input_ids"]
+
+    # Expand image pad tokens.
+
+    if image_inputs is not None:
+        if isinstance(image_inputs, dict):
+            # Dict input carries `image_embeds`: split its rows evenly over
+            # the image placeholders already present in the prompt.
+            image_token_id = hf_config.image_token_id
+            image_cnt = prompt_token_ids.count(image_token_id)
+            embed_dim = image_inputs.get('image_embeds').size(0)
+            assert image_cnt > 0 and embed_dim % image_cnt == 0
+            num_pad_tokens = embed_dim // image_cnt
+            prompt_token_ids_with_image = []
+            for token in prompt_token_ids:
+                if token == image_token_id:
+                    prompt_token_ids_with_image.extend([token] *
+                                                       num_pad_tokens)
+                else:
+                    prompt_token_ids_with_image.append(token)
+            prompt_token_ids = prompt_token_ids_with_image
+        else:
+            prompt_token_ids = _expand_pad_tokens(image_inputs,
+                                                  hf_config.image_token_id,
+                                                  make_batched_images, "image",
+                                                  image_processor,
+                                                  prompt_token_ids)
+
+    if video_inputs is not None:
+        prompt_token_ids = _expand_pad_tokens(video_inputs,
+                                              hf_config.video_token_id,
+                                              make_batched_videos, "video",
+                                              image_processor,
+                                              prompt_token_ids)
+
+    return LLMInputs(
+        prompt_token_ids=prompt_token_ids,
+        prompt=llm_inputs["prompt"],
+        multi_modal_data=multi_modal_data,
+    )
+
+
+@MULTIMODAL_REGISTRY.register_image_input_mapper(
+    image_input_mapper_for_qwen2_5_vl)
+@MULTIMODAL_REGISTRY.register_input_mapper("video",
+                                           video_input_mapper_for_qwen2_5_vl)
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_5_vl_image_tokens)
+@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
+    "video", get_max_qwen2_5_vl_video_tokens)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_5_vl)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_5_vl)
+class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                      SupportsPP):
+    """Qwen2.5-VL: a Qwen2_5_VisionTransformer (`visual`) plus a Qwen2Model."""
+    def __init__(self,
+                 config: Qwen2_5_VLConfig,
+                 multimodal_config: MultiModalConfig,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
+        super().__init__()
+
+        assert cache_config is None or not cache_config.enable_prefix_caching, \
+            "Qwen2.5-VL currently does not support prefix caching"
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        self.visual = Qwen2_5_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+
+            # NOTE: Qwen2-5-VL vision encoder does not support any
+            # quantization method now.
+            quant_config=None,
+        )
+
+        self.model = Qwen2Model(config, cache_config, quant_config)
+
+        if get_pp_group().is_last_rank:
+            if config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens  # tied: reuse the embedding module
+            else:
+                self.lm_head = ParallelLMHead(config.vocab_size,
+                                              config.hidden_size,
+                                              quant_config=quant_config)
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = Sampler()
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size))
+
+    def _validate_and_reshape_mm_tensor(self,
+                                        mm_input: Union[torch.Tensor,
+                                                        List[torch.Tensor]],
+                                        name: str) -> torch.Tensor:
+        if not isinstance(mm_input, (torch.Tensor, list)):
+            raise ValueError(f"Incorrect type of {name}. "
+                             f"Got type: {type(mm_input)}")
+        if isinstance(mm_input, torch.Tensor):
+            if mm_input.ndim == 2:
+                return mm_input  # already flat: returned unchanged
+            if mm_input.ndim != 3:
+                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
+                                 f"Got ndim: {mm_input.ndim}")
+            return torch.concat(list(mm_input))  # 3-D: concatenate the batch entries along dim 0
+        else:
+            return torch.concat(mm_input)  # list of tensors: concatenate along dim 0
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[Qwen2_5_VLImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+        image_grid_thw = kwargs.pop("image_grid_thw", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if pixel_values is not None:  # pixel values take precedence over embeddings
+            pixel_values = self._validate_and_reshape_mm_tensor(
+                pixel_values, "image pixel values")
+            image_grid_thw = self._validate_and_reshape_mm_tensor(
+                image_grid_thw, "image grid_thw")
+
+            if not isinstance(pixel_values, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            return Qwen2_5_VLImagePixelInputs(type="pixel_values",
+                                           data=pixel_values,
+                                           image_grid_thw=image_grid_thw)
+
+        if image_embeds is not None:
+            image_embeds = self._validate_and_reshape_mm_tensor(
+                image_embeds, "image embeds")
+
+            if not isinstance(image_embeds, torch.Tensor):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+            return Qwen2_5_VLImageEmbeddingInputs(type="image_embeds",
+                                               data=image_embeds)
+
+    def _parse_and_validate_video_input(
+            self, **kwargs: object) -> Optional[Qwen2_5_VLVideoInputs]:
+        pixel_values_videos = kwargs.pop("pixel_values_videos", None)
+        video_grid_thw = kwargs.pop("video_grid_thw", None)
+
+        if pixel_values_videos is None:
+            return None
+
+        pixel_values_videos = self._validate_and_reshape_mm_tensor(
+            pixel_values_videos, "video pixel values")
+        video_grid_thw = self._validate_and_reshape_mm_tensor(
+            video_grid_thw, "video grid_thw")
+
+        return Qwen2_5_VLVideoInputs(
+            pixel_values_videos=pixel_values_videos,
+            video_grid_thw=video_grid_thw,
+        )
+
+    def _process_image_input(self,
+                             image_input: Qwen2_5_VLImageInputs) -> torch.Tensor:
+        if image_input["type"] == "image_embeds":
+            return image_input["data"].type(self.visual.dtype)  # embeddings given: only cast the dtype
+
+        pixel_values = image_input["data"].type(self.visual.dtype)
+        image_embeds = self.visual(pixel_values,
+                                   grid_thw=image_input["image_grid_thw"])
+        return image_embeds
+
+    def _process_video_input(self,
+                             video_input: Qwen2_5_VLVideoInputs) -> torch.Tensor:
+        pixel_values_videos = video_input["pixel_values_videos"].type(
+            self.visual.dtype)
+        video_embeds = self.visual(pixel_values_videos,
+                                   grid_thw=video_input["video_grid_thw"])
+        return video_embeds
+
+    def _merge_multimodal_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor,
+        multimodal_embeddings: torch.Tensor,
+        placeholder_token_id: int,
+    ) -> torch.Tensor:
+        mask = (input_ids == placeholder_token_id)
+        inputs_embeds[mask, :] = multimodal_embeddings  # in-place overwrite of the placeholder rows
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        **kwargs: object,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        """Run forward pass for Qwen2.5-VL.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for Qwen2.5-VL
+                opensource models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,)`.
+            pixel_values: Pixel values to be fed to a model.
+                `None` if no images are passed.
+            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
+                `None` if no images are passed.
+            pixel_values_videos: Pixel values of videos to be fed to a model.
+                `None` if no videos are passed.
+            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
+                `None` if no videos are passed.
+        """
+        if intermediate_tensors is not None:
+            input_ids = None  # inputs come from intermediate_tensors instead
+            inputs_embeds = None
+        else:
+            image_input = self._parse_and_validate_image_input(**kwargs)
+            video_input = self._parse_and_validate_video_input(**kwargs)
+
+            if image_input is None and video_input is None:
+                inputs_embeds = None  # text only: the language model embeds input_ids itself
+            else:
+                rope_scaling = getattr(self.config, "rope_scaling", None) or {}
+                if rope_scaling.get("type", None) == "mrope":
+                    assert positions.ndim == 2 and positions.size(0) == 3, (
+                        "multimodal section rotary embedding requires "
+                        f"(3, seq_len) positions, but got {positions.size()}")
+
+                inputs_embeds = self.model.embed_tokens(input_ids)
+
+                if image_input is not None:
+                    image_embeds = self._process_image_input(image_input)
+                    inputs_embeds = self._merge_multimodal_embeddings(
+                        input_ids,
+                        inputs_embeds,
+                        image_embeds,
+                        placeholder_token_id=self.config.image_token_id,
+                    )
+
+                if video_input is not None:
+                    video_embeds = self._process_video_input(video_input)
+                    inputs_embeds = self._merge_multimodal_embeddings(
+                        input_ids,
+                        inputs_embeds,
+                        video_embeds,
+                        placeholder_token_id=self.config.video_token_id,
+                    )
+
+                input_ids = None  # the merged embeddings replace the token ids
+
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue  # tied head: lm_head reuses embed_tokens
+            is_visual_gate_up = "visual" in name and ("gate_proj" in name or "up_proj" in name)
+            if is_visual_gate_up:  # vision gate/up weights skip the gate_up_proj stacking below
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:  # for-else: no stacked mapping loaded this weight
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    param = params_dict[name]
+                except KeyError:
+                    raise ValueError(f"Unexpected weight: {name}") from None
+
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index 87a08b2..3f63c5f 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -136,11 +136,11 @@ class Qwen3Attention(nn.Module):
         # Add qk-norm
         q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim,
                            self.head_dim)
-        q_by_head = self.q_norm.forward_native(q_by_head)
+        q_by_head = self.q_norm.forward_cuda(q_by_head.contiguous())
         q = q_by_head.view(q.shape)
         k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
                            self.head_dim)
-        k_by_head = self.k_norm.forward_native(k_by_head)
+        k_by_head = self.k_norm.forward_cuda(k_by_head.contiguous())
         k = k_by_head.view(k.shape)
         q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 73ae906..a2d063c 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -114,6 +114,7 @@ _MULTIMODAL_MODELS = {
     "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),  # noqa: E501
     "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
+    "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),  # noqa: E501
     "UltravoxModel": ("ultravox", "UltravoxModel"),
     # [Encoder-decoder]
     "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501
diff --git a/vllm/multimodal/__pycache__/__init__.cpython-310.pyc b/vllm/multimodal/__pycache__/__init__.cpython-310.pyc
index 6add128..ab4b707 100644
Binary files a/vllm/multimodal/__pycache__/__init__.cpython-310.pyc and b/vllm/multimodal/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/multimodal/__pycache__/audio.cpython-310.pyc b/vllm/multimodal/__pycache__/audio.cpython-310.pyc
index 8b05e28..a1bbd41 100644
Binary files a/vllm/multimodal/__pycache__/audio.cpython-310.pyc and b/vllm/multimodal/__pycache__/audio.cpython-310.pyc differ
diff --git a/vllm/multimodal/__pycache__/base.cpython-310.pyc b/vllm/multimodal/__pycache__/base.cpython-310.pyc
index db52c36..b2ae847 100644
Binary files a/vllm/multimodal/__pycache__/base.cpython-310.pyc and b/vllm/multimodal/__pycache__/base.cpython-310.pyc differ
diff --git a/vllm/multimodal/__pycache__/image.cpython-310.pyc b/vllm/multimodal/__pycache__/image.cpython-310.pyc
index 0790982..d5898d2 100644
Binary files a/vllm/multimodal/__pycache__/image.cpython-310.pyc and b/vllm/multimodal/__pycache__/image.cpython-310.pyc differ
diff --git a/vllm/multimodal/__pycache__/registry.cpython-310.pyc b/vllm/multimodal/__pycache__/registry.cpython-310.pyc
index 98fe724..350ec50 100644
Binary files a/vllm/multimodal/__pycache__/registry.cpython-310.pyc and b/vllm/multimodal/__pycache__/registry.cpython-310.pyc differ
diff --git a/vllm/multimodal/__pycache__/utils.cpython-310.pyc b/vllm/multimodal/__pycache__/utils.cpython-310.pyc
index 9e3b55c..d04d3df 100644
Binary files a/vllm/multimodal/__pycache__/utils.cpython-310.pyc and b/vllm/multimodal/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/multimodal/__pycache__/video.cpython-310.pyc b/vllm/multimodal/__pycache__/video.cpython-310.pyc
index 6a318a2..e2d3163 100644
Binary files a/vllm/multimodal/__pycache__/video.cpython-310.pyc and b/vllm/multimodal/__pycache__/video.cpython-310.pyc differ
diff --git a/vllm/platforms/__pycache__/__init__.cpython-310.pyc b/vllm/platforms/__pycache__/__init__.cpython-310.pyc
index 07fb96c..0b50dc4 100644
Binary files a/vllm/platforms/__pycache__/__init__.cpython-310.pyc and b/vllm/platforms/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/platforms/__pycache__/cpu.cpython-310.pyc b/vllm/platforms/__pycache__/cpu.cpython-310.pyc
index 0dcd79b..49e4ed2 100644
Binary files a/vllm/platforms/__pycache__/cpu.cpython-310.pyc and b/vllm/platforms/__pycache__/cpu.cpython-310.pyc differ
diff --git a/vllm/platforms/__pycache__/cuda.cpython-310.pyc b/vllm/platforms/__pycache__/cuda.cpython-310.pyc
index 559a358..6c4935a 100644
Binary files a/vllm/platforms/__pycache__/cuda.cpython-310.pyc and b/vllm/platforms/__pycache__/cuda.cpython-310.pyc differ
diff --git a/vllm/platforms/__pycache__/interface.cpython-310.pyc b/vllm/platforms/__pycache__/interface.cpython-310.pyc
index 9e971de..053ec4f 100644
Binary files a/vllm/platforms/__pycache__/interface.cpython-310.pyc and b/vllm/platforms/__pycache__/interface.cpython-310.pyc differ
diff --git a/vllm/platforms/__pycache__/rocm.cpython-310.pyc b/vllm/platforms/__pycache__/rocm.cpython-310.pyc
index 33cda23..b1aab34 100644
Binary files a/vllm/platforms/__pycache__/rocm.cpython-310.pyc and b/vllm/platforms/__pycache__/rocm.cpython-310.pyc differ
diff --git a/vllm/platforms/__pycache__/tpu.cpython-310.pyc b/vllm/platforms/__pycache__/tpu.cpython-310.pyc
index 84d31fe..142ab4d 100644
Binary files a/vllm/platforms/__pycache__/tpu.cpython-310.pyc and b/vllm/platforms/__pycache__/tpu.cpython-310.pyc differ
diff --git a/vllm/platforms/__pycache__/xpu.cpython-310.pyc b/vllm/platforms/__pycache__/xpu.cpython-310.pyc
index 87134d7..b5ebb41 100644
Binary files a/vllm/platforms/__pycache__/xpu.cpython-310.pyc and b/vllm/platforms/__pycache__/xpu.cpython-310.pyc differ
diff --git a/vllm/plugins/__pycache__/__init__.cpython-310.pyc b/vllm/plugins/__pycache__/__init__.cpython-310.pyc
index fb5aa1d..01e729b 100644
Binary files a/vllm/plugins/__pycache__/__init__.cpython-310.pyc and b/vllm/plugins/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/prompt_adapter/__pycache__/__init__.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/__init__.cpython-310.pyc
index b77191e..a0d7d61 100644
Binary files a/vllm/prompt_adapter/__pycache__/__init__.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/prompt_adapter/__pycache__/layers.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/layers.cpython-310.pyc
index abe0d5f..8291d18 100644
Binary files a/vllm/prompt_adapter/__pycache__/layers.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/layers.cpython-310.pyc differ
diff --git a/vllm/prompt_adapter/__pycache__/models.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/models.cpython-310.pyc
index fb21ff6..bfb83a6 100644
Binary files a/vllm/prompt_adapter/__pycache__/models.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/models.cpython-310.pyc differ
diff --git a/vllm/prompt_adapter/__pycache__/request.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/request.cpython-310.pyc
index 4cbd187..29f2651 100644
Binary files a/vllm/prompt_adapter/__pycache__/request.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/request.cpython-310.pyc differ
diff --git a/vllm/prompt_adapter/__pycache__/utils.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/utils.cpython-310.pyc
index a143fd4..0e78e38 100644
Binary files a/vllm/prompt_adapter/__pycache__/utils.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/prompt_adapter/__pycache__/worker_manager.cpython-310.pyc b/vllm/prompt_adapter/__pycache__/worker_manager.cpython-310.pyc
index 3cc1bba..24b2c33 100644
Binary files a/vllm/prompt_adapter/__pycache__/worker_manager.cpython-310.pyc and b/vllm/prompt_adapter/__pycache__/worker_manager.cpython-310.pyc differ
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
index 4cde2a0..aae81de 100644
--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
@@ -90,6 +90,6 @@ def load_peft_weights(model_id: str,
         adapters_weights = safe_load_file(filename, device=device)
     else:
         adapters_weights = torch.load(filename,
-                                      map_location=torch.device(device))
+                                      map_location=torch.device(device), weights_only=True)
 
     return adapters_weights
diff --git a/vllm/spec_decode/__pycache__/__init__.cpython-310.pyc b/vllm/spec_decode/__pycache__/__init__.cpython-310.pyc
index 213d9b8..bccb28a 100644
Binary files a/vllm/spec_decode/__pycache__/__init__.cpython-310.pyc and b/vllm/spec_decode/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/batch_expansion.cpython-310.pyc b/vllm/spec_decode/__pycache__/batch_expansion.cpython-310.pyc
index c07646f..6c3fbc2 100644
Binary files a/vllm/spec_decode/__pycache__/batch_expansion.cpython-310.pyc and b/vllm/spec_decode/__pycache__/batch_expansion.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/draft_model_runner.cpython-310.pyc b/vllm/spec_decode/__pycache__/draft_model_runner.cpython-310.pyc
index a298a18..50e0db8 100644
Binary files a/vllm/spec_decode/__pycache__/draft_model_runner.cpython-310.pyc and b/vllm/spec_decode/__pycache__/draft_model_runner.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/interfaces.cpython-310.pyc b/vllm/spec_decode/__pycache__/interfaces.cpython-310.pyc
index ed50914..9a5a065 100644
Binary files a/vllm/spec_decode/__pycache__/interfaces.cpython-310.pyc and b/vllm/spec_decode/__pycache__/interfaces.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/medusa_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/medusa_worker.cpython-310.pyc
index d887416..325c060 100644
Binary files a/vllm/spec_decode/__pycache__/medusa_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/medusa_worker.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/metrics.cpython-310.pyc b/vllm/spec_decode/__pycache__/metrics.cpython-310.pyc
index f37fc63..9518ee3 100644
Binary files a/vllm/spec_decode/__pycache__/metrics.cpython-310.pyc and b/vllm/spec_decode/__pycache__/metrics.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/mlp_speculator_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/mlp_speculator_worker.cpython-310.pyc
index da7501c..5e4f97d 100644
Binary files a/vllm/spec_decode/__pycache__/mlp_speculator_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/mlp_speculator_worker.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/mqa_scorer.cpython-310.pyc b/vllm/spec_decode/__pycache__/mqa_scorer.cpython-310.pyc
index 344f6a5..3a689b4 100644
Binary files a/vllm/spec_decode/__pycache__/mqa_scorer.cpython-310.pyc and b/vllm/spec_decode/__pycache__/mqa_scorer.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/multi_step_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/multi_step_worker.cpython-310.pyc
index 3eae3f9..7ad4533 100644
Binary files a/vllm/spec_decode/__pycache__/multi_step_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/multi_step_worker.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/ngram_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/ngram_worker.cpython-310.pyc
index 0da7d4e..dfc4758 100644
Binary files a/vllm/spec_decode/__pycache__/ngram_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/ngram_worker.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/proposer_worker_base.cpython-310.pyc b/vllm/spec_decode/__pycache__/proposer_worker_base.cpython-310.pyc
index 2b3eb51..6d326eb 100644
Binary files a/vllm/spec_decode/__pycache__/proposer_worker_base.cpython-310.pyc and b/vllm/spec_decode/__pycache__/proposer_worker_base.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/smaller_tp_proposer_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/smaller_tp_proposer_worker.cpython-310.pyc
index c47e835..c8bd07f 100644
Binary files a/vllm/spec_decode/__pycache__/smaller_tp_proposer_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/smaller_tp_proposer_worker.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/spec_decode_worker.cpython-310.pyc b/vllm/spec_decode/__pycache__/spec_decode_worker.cpython-310.pyc
index a4e914a..dae197d 100644
Binary files a/vllm/spec_decode/__pycache__/spec_decode_worker.cpython-310.pyc and b/vllm/spec_decode/__pycache__/spec_decode_worker.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/target_model_runner.cpython-310.pyc b/vllm/spec_decode/__pycache__/target_model_runner.cpython-310.pyc
index af40176..dcefa2f 100644
Binary files a/vllm/spec_decode/__pycache__/target_model_runner.cpython-310.pyc and b/vllm/spec_decode/__pycache__/target_model_runner.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/top1_proposer.cpython-310.pyc b/vllm/spec_decode/__pycache__/top1_proposer.cpython-310.pyc
index 0e72f46..d1f72a3 100644
Binary files a/vllm/spec_decode/__pycache__/top1_proposer.cpython-310.pyc and b/vllm/spec_decode/__pycache__/top1_proposer.cpython-310.pyc differ
diff --git a/vllm/spec_decode/__pycache__/util.cpython-310.pyc b/vllm/spec_decode/__pycache__/util.cpython-310.pyc
index 2f06c7f..1a4ce6b 100644
Binary files a/vllm/spec_decode/__pycache__/util.cpython-310.pyc and b/vllm/spec_decode/__pycache__/util.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc b/vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc
index 8b699dd..628a03d 100644
Binary files a/vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/__pycache__/config.cpython-310.pyc b/vllm/transformers_utils/__pycache__/config.cpython-310.pyc
index 852d8f1..532911c 100644
Binary files a/vllm/transformers_utils/__pycache__/config.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/config.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/__pycache__/detokenizer.cpython-310.pyc b/vllm/transformers_utils/__pycache__/detokenizer.cpython-310.pyc
index cb27203..6812464 100644
Binary files a/vllm/transformers_utils/__pycache__/detokenizer.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/detokenizer.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/__pycache__/processor.cpython-310.pyc b/vllm/transformers_utils/__pycache__/processor.cpython-310.pyc
index a535b54..92197be 100644
Binary files a/vllm/transformers_utils/__pycache__/processor.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/processor.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc b/vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc
index 60b4473..191d284 100644
Binary files a/vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/tokenizer.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/__pycache__/utils.cpython-310.pyc b/vllm/transformers_utils/__pycache__/utils.cpython-310.pyc
index 77fb60c..4dc959c 100644
Binary files a/vllm/transformers_utils/__pycache__/utils.cpython-310.pyc and b/vllm/transformers_utils/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/__init__.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/__init__.cpython-310.pyc
index 400b5e3..52a9ae4 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/__init__.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/arctic.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/arctic.cpython-310.pyc
index 84f5429..492608d 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/arctic.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/arctic.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/chatglm.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/chatglm.cpython-310.pyc
index 5b934d3..b8baa1b 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/chatglm.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/chatglm.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/dbrx.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/dbrx.cpython-310.pyc
index 2f8b67a..f014576 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/dbrx.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/dbrx.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/eagle.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/eagle.cpython-310.pyc
index 39a4a50..be8b08a 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/eagle.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/eagle.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/exaone.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/exaone.cpython-310.pyc
index 161adbc..c39474d 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/exaone.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/exaone.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/falcon.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/falcon.cpython-310.pyc
index aab95b5..49f1625 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/falcon.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/falcon.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/internvl.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/internvl.cpython-310.pyc
index 02d777a..8477890 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/internvl.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/internvl.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc
index 851452d..777501a 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/jais.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/medusa.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/medusa.cpython-310.pyc
index dd42292..ad40fe5 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/medusa.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/medusa.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/mllama.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/mllama.cpython-310.pyc
index 6fcf925..e247025 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/mllama.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/mllama.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/mlp_speculator.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/mlp_speculator.cpython-310.pyc
index a807a6d..8ce7a6d 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/mlp_speculator.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/mlp_speculator.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc
index 18a7758..07ea934 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/nemotron.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/nemotron.cpython-310.pyc
index d8b8e04..693b3ca 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/nemotron.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/nemotron.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/nvlm_d.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/nvlm_d.cpython-310.pyc
index 3e46783..f9cee42 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/nvlm_d.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/nvlm_d.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/qwen2vl.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/qwen2vl.cpython-310.pyc
index 6d53853..32491a1 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/qwen2vl.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/qwen2vl.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/solar.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/solar.cpython-310.pyc
index 72f652c..17a155c 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/solar.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/solar.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/configs/__pycache__/ultravox.cpython-310.pyc b/vllm/transformers_utils/configs/__pycache__/ultravox.cpython-310.pyc
index 2385a07..103564d 100644
Binary files a/vllm/transformers_utils/configs/__pycache__/ultravox.cpython-310.pyc and b/vllm/transformers_utils/configs/__pycache__/ultravox.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-310.pyc b/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-310.pyc
index 65433db..f7ff6e4 100644
Binary files a/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-310.pyc and b/vllm/transformers_utils/tokenizer_group/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-310.pyc b/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-310.pyc
index 63750e8..e107728 100644
Binary files a/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-310.pyc and b/vllm/transformers_utils/tokenizer_group/__pycache__/base_tokenizer_group.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-310.pyc b/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-310.pyc
index 43c7d5f..5c3bbcc 100644
Binary files a/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-310.pyc and b/vllm/transformers_utils/tokenizer_group/__pycache__/ray_tokenizer_group.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-310.pyc b/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-310.pyc
index 4554869..0d35ced 100644
Binary files a/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-310.pyc and b/vllm/transformers_utils/tokenizer_group/__pycache__/tokenizer_group.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/tokenizers/__pycache__/__init__.cpython-310.pyc b/vllm/transformers_utils/tokenizers/__pycache__/__init__.cpython-310.pyc
index e4bba79..1e6b3a2 100644
Binary files a/vllm/transformers_utils/tokenizers/__pycache__/__init__.cpython-310.pyc and b/vllm/transformers_utils/tokenizers/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/tokenizers/__pycache__/mistral.cpython-310.pyc b/vllm/transformers_utils/tokenizers/__pycache__/mistral.cpython-310.pyc
index 27bceff..c3399cc 100644
Binary files a/vllm/transformers_utils/tokenizers/__pycache__/mistral.cpython-310.pyc and b/vllm/transformers_utils/tokenizers/__pycache__/mistral.cpython-310.pyc differ
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index aae10d3..b7e33ae 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -1,5 +1,5 @@
 import os
-import re
+import regex as re
 from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py
index 4e19581..80f7a32 100644
--- a/vllm/triton_utils/__init__.py
+++ b/vllm/triton_utils/__init__.py
@@ -2,8 +2,8 @@ from vllm.triton_utils.importing import HAS_TRITON
 
 __all__ = ["HAS_TRITON"]
 
-#from vllm.triton_utils.custom_cache_manager import (
-#    maybe_set_triton_cache_manager)
-#from vllm.triton_utils.libentry import libentry
+from vllm.triton_utils.custom_cache_manager import (
+    maybe_set_triton_cache_manager)
+from vllm.triton_utils.libentry import libentry
 
 __all__ += ["maybe_set_triton_cache_manager", "libentry"]
diff --git a/vllm/triton_utils/__pycache__/__init__.cpython-310.pyc b/vllm/triton_utils/__pycache__/__init__.cpython-310.pyc
index 9be72bc..1ddb711 100644
Binary files a/vllm/triton_utils/__pycache__/__init__.cpython-310.pyc and b/vllm/triton_utils/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/triton_utils/__pycache__/custom_cache_manager.cpython-310.pyc b/vllm/triton_utils/__pycache__/custom_cache_manager.cpython-310.pyc
index 63d5cd3..6359ac7 100644
Binary files a/vllm/triton_utils/__pycache__/custom_cache_manager.cpython-310.pyc and b/vllm/triton_utils/__pycache__/custom_cache_manager.cpython-310.pyc differ
diff --git a/vllm/triton_utils/__pycache__/importing.cpython-310.pyc b/vllm/triton_utils/__pycache__/importing.cpython-310.pyc
index 81c44b1..1a18938 100644
Binary files a/vllm/triton_utils/__pycache__/importing.cpython-310.pyc and b/vllm/triton_utils/__pycache__/importing.cpython-310.pyc differ
diff --git a/vllm/triton_utils/__pycache__/libentry.cpython-310.pyc b/vllm/triton_utils/__pycache__/libentry.cpython-310.pyc
index 5e74d2b..2aa05a1 100644
Binary files a/vllm/triton_utils/__pycache__/libentry.cpython-310.pyc and b/vllm/triton_utils/__pycache__/libentry.cpython-310.pyc differ
diff --git a/vllm/usage/__pycache__/__init__.cpython-310.pyc b/vllm/usage/__pycache__/__init__.cpython-310.pyc
index 9320f07..2ebb663 100644
Binary files a/vllm/usage/__pycache__/__init__.cpython-310.pyc and b/vllm/usage/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/usage/__pycache__/usage_lib.cpython-310.pyc b/vllm/usage/__pycache__/usage_lib.cpython-310.pyc
index 4c0bd29..b7990bf 100644
Binary files a/vllm/usage/__pycache__/usage_lib.cpython-310.pyc and b/vllm/usage/__pycache__/usage_lib.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/__init__.cpython-310.pyc b/vllm/worker/__pycache__/__init__.cpython-310.pyc
index 9f2330d..625607b 100644
Binary files a/vllm/worker/__pycache__/__init__.cpython-310.pyc and b/vllm/worker/__pycache__/__init__.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/cache_engine.cpython-310.pyc b/vllm/worker/__pycache__/cache_engine.cpython-310.pyc
index ba2d296..3facbcf 100644
Binary files a/vllm/worker/__pycache__/cache_engine.cpython-310.pyc and b/vllm/worker/__pycache__/cache_engine.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/cpu_enc_dec_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/cpu_enc_dec_model_runner.cpython-310.pyc
index 29b091e..b6a4074 100644
Binary files a/vllm/worker/__pycache__/cpu_enc_dec_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/cpu_enc_dec_model_runner.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/cpu_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/cpu_model_runner.cpython-310.pyc
index 44e5a90..b897138 100644
Binary files a/vllm/worker/__pycache__/cpu_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/cpu_model_runner.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/cpu_worker.cpython-310.pyc b/vllm/worker/__pycache__/cpu_worker.cpython-310.pyc
index c85339b..65c86b6 100644
Binary files a/vllm/worker/__pycache__/cpu_worker.cpython-310.pyc and b/vllm/worker/__pycache__/cpu_worker.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/embedding_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/embedding_model_runner.cpython-310.pyc
index 3803bc6..336c63b 100644
Binary files a/vllm/worker/__pycache__/embedding_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/embedding_model_runner.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/enc_dec_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/enc_dec_model_runner.cpython-310.pyc
index b125020..3f44f9d 100644
Binary files a/vllm/worker/__pycache__/enc_dec_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/enc_dec_model_runner.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/model_runner.cpython-310.pyc b/vllm/worker/__pycache__/model_runner.cpython-310.pyc
index a8cef01..b85fb28 100644
Binary files a/vllm/worker/__pycache__/model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/model_runner.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/model_runner_base.cpython-310.pyc b/vllm/worker/__pycache__/model_runner_base.cpython-310.pyc
index 3706030..41fe8a4 100644
Binary files a/vllm/worker/__pycache__/model_runner_base.cpython-310.pyc and b/vllm/worker/__pycache__/model_runner_base.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/multi_step_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/multi_step_model_runner.cpython-310.pyc
index 6a57eb8..b05eb79 100644
Binary files a/vllm/worker/__pycache__/multi_step_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/multi_step_model_runner.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/multi_step_tpu_worker.cpython-310.pyc b/vllm/worker/__pycache__/multi_step_tpu_worker.cpython-310.pyc
index 1ef4b45..fabe7d2 100644
Binary files a/vllm/worker/__pycache__/multi_step_tpu_worker.cpython-310.pyc and b/vllm/worker/__pycache__/multi_step_tpu_worker.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/multi_step_worker.cpython-310.pyc b/vllm/worker/__pycache__/multi_step_worker.cpython-310.pyc
index c55ecd0..6144413 100644
Binary files a/vllm/worker/__pycache__/multi_step_worker.cpython-310.pyc and b/vllm/worker/__pycache__/multi_step_worker.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/neuron_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/neuron_model_runner.cpython-310.pyc
index 3740c52..f69ffee 100644
Binary files a/vllm/worker/__pycache__/neuron_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/neuron_model_runner.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/neuron_worker.cpython-310.pyc b/vllm/worker/__pycache__/neuron_worker.cpython-310.pyc
index 5fac9ad..9a00b6b 100644
Binary files a/vllm/worker/__pycache__/neuron_worker.cpython-310.pyc and b/vllm/worker/__pycache__/neuron_worker.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/openvino_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/openvino_model_runner.cpython-310.pyc
index 6237b1c..335c0e0 100644
Binary files a/vllm/worker/__pycache__/openvino_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/openvino_model_runner.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/openvino_worker.cpython-310.pyc b/vllm/worker/__pycache__/openvino_worker.cpython-310.pyc
index f6fa42e..87ac044 100644
Binary files a/vllm/worker/__pycache__/openvino_worker.cpython-310.pyc and b/vllm/worker/__pycache__/openvino_worker.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/tpu_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/tpu_model_runner.cpython-310.pyc
index 3a70092..6ee4b27 100644
Binary files a/vllm/worker/__pycache__/tpu_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/tpu_model_runner.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/tpu_worker.cpython-310.pyc b/vllm/worker/__pycache__/tpu_worker.cpython-310.pyc
index e76dea5..0b963f6 100644
Binary files a/vllm/worker/__pycache__/tpu_worker.cpython-310.pyc and b/vllm/worker/__pycache__/tpu_worker.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/utils.cpython-310.pyc b/vllm/worker/__pycache__/utils.cpython-310.pyc
index a32a86a..6e96466 100644
Binary files a/vllm/worker/__pycache__/utils.cpython-310.pyc and b/vllm/worker/__pycache__/utils.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/worker.cpython-310.pyc b/vllm/worker/__pycache__/worker.cpython-310.pyc
index 2ea3809..b65ca57 100644
Binary files a/vllm/worker/__pycache__/worker.cpython-310.pyc and b/vllm/worker/__pycache__/worker.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/worker_base.cpython-310.pyc b/vllm/worker/__pycache__/worker_base.cpython-310.pyc
index b3bdfef..a1aec6a 100644
Binary files a/vllm/worker/__pycache__/worker_base.cpython-310.pyc and b/vllm/worker/__pycache__/worker_base.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/xpu_model_runner.cpython-310.pyc b/vllm/worker/__pycache__/xpu_model_runner.cpython-310.pyc
index 8130605..ee9897b 100644
Binary files a/vllm/worker/__pycache__/xpu_model_runner.cpython-310.pyc and b/vllm/worker/__pycache__/xpu_model_runner.cpython-310.pyc differ
diff --git a/vllm/worker/__pycache__/xpu_worker.cpython-310.pyc b/vllm/worker/__pycache__/xpu_worker.cpython-310.pyc
index 7b6f418..c4cc9e5 100644
Binary files a/vllm/worker/__pycache__/xpu_worker.cpython-310.pyc and b/vllm/worker/__pycache__/xpu_worker.cpython-310.pyc differ