Update to new version of base image

2025-10-24 15:45:06 +08:00
parent ee04aead1e
commit fad74b701b
476 changed files with 1270 additions and 46 deletions
--- a/10
+++ b/10
@@ -1,19 +1,15 @@
-FROM git.modelhub.org.cn:9443/enginex-iluvatar/bi100-3.2.1-x86-ubuntu20.04-py3.10-poc-llm-infer:20250731115755
+FROM git.modelhub.org.cn:9443/enginex-iluvatar/bi100-3.2.3-x86-ubuntu20.04-py3.10-poc-llm-infer:v1.2.3

 RUN pip install --no-cache-dir triton==2.1.0

 COPY pkgs/triton /usr/local/corex/lib64/python3/dist-packages/triton
 COPY pkgs/triton-2.1.0+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/triton-2.1.0+corex.4.1.2.dist-info
-COPY pkgs/xformers-0.0.22+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/xformers-0.0.22+corex.4.1.2.dist-info
-COPY pkgs/xformers /usr/local/corex/lib64/python3/dist-packages/xformers

-COPY paged_attn.py /usr/local/lib/python3.10/site-packages/vllm/attention/ops/paged_attn.py
-COPY __init__.py /usr/local/lib/python3.10/site-packages/vllm/triton_utils/__init__.py
-COPY prefix_prefill.py /usr/local/lib/python3.10/site-packages/vllm/attention/ops/prefix_prefill.py
+COPY paged_attn.py /usr/local/corex/lib64/python3/dist-packages/vllm/attention/ops/paged_attn.py
+COPY __init__.py /usr/local/corex/lib64/python3/dist-packages/vllm/triton_utils/__init__.py

 RUN mkdir /workspace
 WORKDIR /workspace/

 COPY ./launch_service /workspace/launch_service

-ENTRYPOINT ["./launch_service"]
--- a/paged_attn.py
+++ b/paged_attn.py
@@ -4,6 +4,7 @@ from typing import List, Optional, Tuple
 import torch

 from vllm import _custom_ops as ops
+
 from vllm.attention.ops.prefix_prefill import context_attention_fwd

 # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
--- a/vllm/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/__pycache__/__init__.cpython-310.pyc
--- a/vllm/__pycache__/_core_ext.cpython-310.pyc
+++ b/vllm/__pycache__/_core_ext.cpython-310.pyc
--- a/vllm/__pycache__/_custom_ops.cpython-310.pyc
+++ b/vllm/__pycache__/_custom_ops.cpython-310.pyc
--- a/vllm/__pycache__/_ipex_ops.cpython-310.pyc
+++ b/vllm/__pycache__/_ipex_ops.cpython-310.pyc
--- a/vllm/__pycache__/beam_search.cpython-310.pyc
+++ b/vllm/__pycache__/beam_search.cpython-310.pyc
--- a/vllm/__pycache__/block.cpython-310.pyc
+++ b/vllm/__pycache__/block.cpython-310.pyc
--- a/vllm/__pycache__/config.cpython-310.pyc
+++ b/vllm/__pycache__/config.cpython-310.pyc
--- a/vllm/__pycache__/connections.cpython-310.pyc
+++ b/vllm/__pycache__/connections.cpython-310.pyc
--- a/vllm/__pycache__/envs.cpython-310.pyc
+++ b/vllm/__pycache__/envs.cpython-310.pyc
--- a/vllm/__pycache__/forward_context.cpython-310.pyc
+++ b/vllm/__pycache__/forward_context.cpython-310.pyc
--- a/vllm/__pycache__/logger.cpython-310.pyc
+++ b/vllm/__pycache__/logger.cpython-310.pyc
--- a/vllm/__pycache__/outputs.cpython-310.pyc
+++ b/vllm/__pycache__/outputs.cpython-310.pyc
--- a/vllm/__pycache__/pooling_params.cpython-310.pyc
+++ b/vllm/__pycache__/pooling_params.cpython-310.pyc
--- a/vllm/__pycache__/sampling_params.cpython-310.pyc
+++ b/vllm/__pycache__/sampling_params.cpython-310.pyc
--- a/vllm/__pycache__/scalar_type.cpython-310.pyc
+++ b/vllm/__pycache__/scalar_type.cpython-310.pyc
--- a/vllm/__pycache__/scripts.cpython-310.pyc
+++ b/vllm/__pycache__/scripts.cpython-310.pyc
--- a/vllm/__pycache__/sequence.cpython-310.pyc
+++ b/vllm/__pycache__/sequence.cpython-310.pyc
--- a/vllm/__pycache__/tracing.cpython-310.pyc
+++ b/vllm/__pycache__/tracing.cpython-310.pyc
--- a/vllm/__pycache__/utils.cpython-310.pyc
+++ b/vllm/__pycache__/utils.cpython-310.pyc
--- a/vllm/__pycache__/version.cpython-310.pyc
+++ b/vllm/__pycache__/version.cpython-310.pyc
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1102,4 +1102,4 @@ for k, v in names_and_values.items():
        names_and_values_to_update[k] = hint_on_error(v)

 names_and_values.update(names_and_values_to_update)
-del names_and_values_to_update, names_and_values, v, k, fn_type
+del names_and_values_to_update, names_and_values, v, k, fn_type
--- a/vllm/adapter_commons/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/adapter_commons/__pycache__/__init__.cpython-310.pyc
--- a/vllm/adapter_commons/__pycache__/layers.cpython-310.pyc
+++ b/vllm/adapter_commons/__pycache__/layers.cpython-310.pyc
--- a/vllm/adapter_commons/__pycache__/models.cpython-310.pyc
+++ b/vllm/adapter_commons/__pycache__/models.cpython-310.pyc
--- a/vllm/adapter_commons/__pycache__/request.cpython-310.pyc
+++ b/vllm/adapter_commons/__pycache__/request.cpython-310.pyc
--- a/vllm/adapter_commons/__pycache__/utils.cpython-310.pyc
+++ b/vllm/adapter_commons/__pycache__/utils.cpython-310.pyc
--- a/vllm/adapter_commons/__pycache__/worker_manager.cpython-310.pyc
+++ b/vllm/adapter_commons/__pycache__/worker_manager.cpython-310.pyc
--- a/vllm/assets/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/assets/__pycache__/__init__.cpython-310.pyc
--- a/vllm/assets/__pycache__/audio.cpython-310.pyc
+++ b/vllm/assets/__pycache__/audio.cpython-310.pyc
--- a/vllm/assets/__pycache__/base.cpython-310.pyc
+++ b/vllm/assets/__pycache__/base.cpython-310.pyc
--- a/vllm/assets/__pycache__/image.cpython-310.pyc
+++ b/vllm/assets/__pycache__/image.cpython-310.pyc
--- a/vllm/assets/__pycache__/video.cpython-310.pyc
+++ b/vllm/assets/__pycache__/video.cpython-310.pyc
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -27,4 +27,4 @@ class ImageAsset:
        """
        image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
                                            s3_prefix=VLM_IMAGES_DIR)
-        return torch.load(image_path)
+        return torch.load(image_path, weights_only=True)
--- a/vllm/attention/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/attention/__pycache__/__init__.cpython-310.pyc
--- a/vllm/attention/__pycache__/layer.cpython-310.pyc
+++ b/vllm/attention/__pycache__/layer.cpython-310.pyc
--- a/vllm/attention/__pycache__/selector.cpython-310.pyc
+++ b/vllm/attention/__pycache__/selector.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/blocksparse_attn.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/blocksparse_attn.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/flashinfer.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/flashinfer.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/ipex_attn.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/ipex_attn.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/openvino.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/openvino.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/pallas.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/pallas.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/placeholder_attn.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/placeholder_attn.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/rocm_flash_attn.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/rocm_flash_attn.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/torch_sdpa.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/torch_sdpa.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/utils.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/utils.cpython-310.pyc
--- a/vllm/attention/backends/__pycache__/xformers.cpython-310.pyc
+++ b/vllm/attention/backends/__pycache__/xformers.cpython-310.pyc
--- a/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc
--- a/vllm/attention/ops/__pycache__/ipex_attn.cpython-310.pyc
+++ b/vllm/attention/ops/__pycache__/ipex_attn.cpython-310.pyc
--- a/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc
+++ b/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc
--- a/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc
+++ b/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc
--- a/vllm/attention/ops/__pycache__/triton_flash_attention.cpython-310.pyc
+++ b/vllm/attention/ops/__pycache__/triton_flash_attention.cpython-310.pyc
--- a/vllm/attention/ops/blocksparse_attention/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/attention/ops/blocksparse_attention/__pycache__/__init__.cpython-310.pyc
--- a/vllm/attention/ops/blocksparse_attention/__pycache__/blocksparse_attention_kernel.cpython-310.pyc
+++ b/vllm/attention/ops/blocksparse_attention/__pycache__/blocksparse_attention_kernel.cpython-310.pyc
--- a/vllm/attention/ops/blocksparse_attention/__pycache__/interface.cpython-310.pyc
+++ b/vllm/attention/ops/blocksparse_attention/__pycache__/interface.cpython-310.pyc
--- a/vllm/attention/ops/blocksparse_attention/__pycache__/utils.cpython-310.pyc
+++ b/vllm/attention/ops/blocksparse_attention/__pycache__/utils.cpython-310.pyc
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -4,7 +4,10 @@ from typing import List, Optional, Tuple
 import torch

 from vllm import _custom_ops as ops
-from vllm.attention.ops.prefix_prefill import context_attention_fwd
+from vllm.triton_utils import HAS_TRITON
+
+if HAS_TRITON:
+    from vllm.attention.ops.prefix_prefill import context_attention_fwd

 # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
 _PARTITION_SIZE = 512
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -808,8 +808,6 @@ if triton.__version__ >= "2.1.0":
            )
            return

-        import time
-        ts_beg = time.time()
        _fwd_kernel[grid](
            q,
            k,
@@ -860,6 +858,4 @@ if triton.__version__ >= "2.1.0":
            num_warps=NUM_WARPS,
            num_stages=1,
        )
-        elapsed = time.time() - ts_beg
-        #print(f'{elapsed}: {BLOCK=}, {Lk=}, {Lk_padded=}, {BLOCK=}, {sliding_window=}, {NUM_WARPS=}')
        return
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -204,9 +204,6 @@ def which_attn_to_use(
        if selected_backend != _Backend.PALLAS:
            logger.info("Cannot use %s backend on TPU.", selected_backend)
        return _Backend.PALLAS
-    
-    if selected_backend == _Backend.FLASH_ATTN:
-        print("selected_backend == _Backend.FLASH_ATTN")

    if is_hip():
        # AMD GPUs.
--- a/vllm/compilation/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/compilation/__pycache__/__init__.cpython-310.pyc
--- a/vllm/compilation/__pycache__/backends.cpython-310.pyc
+++ b/vllm/compilation/__pycache__/backends.cpython-310.pyc
--- a/vllm/compilation/__pycache__/compile_context.cpython-310.pyc
+++ b/vllm/compilation/__pycache__/compile_context.cpython-310.pyc
--- a/vllm/compilation/__pycache__/decorators.cpython-310.pyc
+++ b/vllm/compilation/__pycache__/decorators.cpython-310.pyc
--- a/vllm/compilation/__pycache__/levels.cpython-310.pyc
+++ b/vllm/compilation/__pycache__/levels.cpython-310.pyc
--- a/vllm/compilation/__pycache__/wrapper.cpython-310.pyc
+++ b/vllm/compilation/__pycache__/wrapper.cpython-310.pyc
--- a/vllm/core/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/core/__pycache__/__init__.cpython-310.pyc
--- a/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc
+++ b/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc
--- a/vllm/core/__pycache__/block_manager_v2.cpython-310.pyc
+++ b/vllm/core/__pycache__/block_manager_v2.cpython-310.pyc
--- a/vllm/core/__pycache__/evictor_v1.cpython-310.pyc
+++ b/vllm/core/__pycache__/evictor_v1.cpython-310.pyc
--- a/vllm/core/__pycache__/evictor_v2.cpython-310.pyc
+++ b/vllm/core/__pycache__/evictor_v2.cpython-310.pyc
--- a/vllm/core/__pycache__/interfaces.cpython-310.pyc
+++ b/vllm/core/__pycache__/interfaces.cpython-310.pyc
--- a/vllm/core/__pycache__/placeholder_block_space_manager.cpython-310.pyc
+++ b/vllm/core/__pycache__/placeholder_block_space_manager.cpython-310.pyc
--- a/vllm/core/__pycache__/scheduler.cpython-310.pyc
+++ b/vllm/core/__pycache__/scheduler.cpython-310.pyc
--- a/vllm/core/block/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/core/block/__pycache__/__init__.cpython-310.pyc
--- a/vllm/core/block/__pycache__/block_table.cpython-310.pyc
+++ b/vllm/core/block/__pycache__/block_table.cpython-310.pyc
--- a/vllm/core/block/__pycache__/common.cpython-310.pyc
+++ b/vllm/core/block/__pycache__/common.cpython-310.pyc
--- a/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-310.pyc
+++ b/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-310.pyc
--- a/vllm/core/block/__pycache__/interfaces.cpython-310.pyc
+++ b/vllm/core/block/__pycache__/interfaces.cpython-310.pyc
--- a/vllm/core/block/__pycache__/naive_block.cpython-310.pyc
+++ b/vllm/core/block/__pycache__/naive_block.cpython-310.pyc
--- a/vllm/core/block/__pycache__/prefix_caching_block.cpython-310.pyc
+++ b/vllm/core/block/__pycache__/prefix_caching_block.cpython-310.pyc
--- a/vllm/core/block/__pycache__/utils.cpython-310.pyc
+++ b/vllm/core/block/__pycache__/utils.cpython-310.pyc
--- a/vllm/distributed/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/distributed/__pycache__/__init__.cpython-310.pyc
--- a/vllm/distributed/__pycache__/communication_op.cpython-310.pyc
+++ b/vllm/distributed/__pycache__/communication_op.cpython-310.pyc
--- a/vllm/distributed/__pycache__/parallel_state.cpython-310.pyc
+++ b/vllm/distributed/__pycache__/parallel_state.cpython-310.pyc
--- a/vllm/distributed/__pycache__/utils.cpython-310.pyc
+++ b/vllm/distributed/__pycache__/utils.cpython-310.pyc
--- a/vllm/distributed/device_communicators/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/distributed/device_communicators/__pycache__/__init__.cpython-310.pyc
--- a/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-310.pyc
+++ b/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-310.pyc
--- a/vllm/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-310.pyc
+++ b/vllm/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-310.pyc
--- a/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-310.pyc
+++ b/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-310.pyc
--- a/vllm/distributed/device_communicators/__pycache__/pynccl.cpython-310.pyc
+++ b/vllm/distributed/device_communicators/__pycache__/pynccl.cpython-310.pyc
--- a/vllm/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-310.pyc
+++ b/vllm/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-310.pyc
--- a/vllm/distributed/device_communicators/__pycache__/shm_broadcast.cpython-310.pyc
+++ b/vllm/distributed/device_communicators/__pycache__/shm_broadcast.cpython-310.pyc
--- a/vllm/distributed/device_communicators/__pycache__/tpu_communicator.cpython-310.pyc
+++ b/vllm/distributed/device_communicators/__pycache__/tpu_communicator.cpython-310.pyc
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -217,7 +217,8 @@ class MessageQueue:
            remote_subscribe_port = get_open_port()
            if is_valid_ipv6_address(connect_ip):
                self.remote_socket.setsockopt(IPV6, 1)
-            socket_addr = f"tcp://*:{remote_subscribe_port}"
+                connect_ip = f"[{connect_ip}]"
+            socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}"
            self.remote_socket.bind(socket_addr)

        else:
--- a/vllm/engine/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/engine/__pycache__/__init__.cpython-310.pyc
--- a/vllm/engine/__pycache__/arg_utils.cpython-310.pyc
+++ b/vllm/engine/__pycache__/arg_utils.cpython-310.pyc
--- a/Show More
+++ b/Show More