Update to a new version of the base image

This commit is contained in:
2025-10-24 15:45:06 +08:00
parent ee04aead1e
commit fad74b701b
476 changed files with 1270 additions and 46 deletions

View File

@@ -1,19 +1,15 @@
FROM git.modelhub.org.cn:9443/enginex-iluvatar/bi100-3.2.1-x86-ubuntu20.04-py3.10-poc-llm-infer:20250731115755 FROM git.modelhub.org.cn:9443/enginex-iluvatar/bi100-3.2.3-x86-ubuntu20.04-py3.10-poc-llm-infer:v1.2.3
RUN pip install --no-cache-dir triton==2.1.0 RUN pip install --no-cache-dir triton==2.1.0
COPY pkgs/triton /usr/local/corex/lib64/python3/dist-packages/triton COPY pkgs/triton /usr/local/corex/lib64/python3/dist-packages/triton
COPY pkgs/triton-2.1.0+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/triton-2.1.0+corex.4.1.2.dist-info COPY pkgs/triton-2.1.0+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/triton-2.1.0+corex.4.1.2.dist-info
COPY pkgs/xformers-0.0.22+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/xformers-0.0.22+corex.4.1.2.dist-info
COPY pkgs/xformers /usr/local/corex/lib64/python3/dist-packages/xformers
COPY paged_attn.py /usr/local/lib/python3.10/site-packages/vllm/attention/ops/paged_attn.py COPY paged_attn.py /usr/local/corex/lib64/python3/dist-packages/vllm/attention/ops/paged_attn.py
COPY __init__.py /usr/local/lib/python3.10/site-packages/vllm/triton_utils/__init__.py COPY __init__.py /usr/local/corex/lib64/python3/dist-packages/vllm/triton_utils/__init__.py
COPY prefix_prefill.py /usr/local/lib/python3.10/site-packages/vllm/attention/ops/prefix_prefill.py
RUN mkdir /workspace RUN mkdir /workspace
WORKDIR /workspace/ WORKDIR /workspace/
COPY ./launch_service /workspace/launch_service COPY ./launch_service /workspace/launch_service
ENTRYPOINT ["./launch_service"]

View File

@@ -4,6 +4,7 @@ from typing import List, Optional, Tuple
import torch import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.attention.ops.prefix_prefill import context_attention_fwd from vllm.attention.ops.prefix_prefill import context_attention_fwd
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.

View File

@@ -1102,4 +1102,4 @@ for k, v in names_and_values.items():
names_and_values_to_update[k] = hint_on_error(v) names_and_values_to_update[k] = hint_on_error(v)
names_and_values.update(names_and_values_to_update) names_and_values.update(names_and_values_to_update)
del names_and_values_to_update, names_and_values, v, k, fn_type del names_and_values_to_update, names_and_values, v, k, fn_type

View File

@@ -27,4 +27,4 @@ class ImageAsset:
""" """
image_path = get_vllm_public_assets(filename=f"{self.name}.pt", image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
s3_prefix=VLM_IMAGES_DIR) s3_prefix=VLM_IMAGES_DIR)
return torch.load(image_path) return torch.load(image_path, weights_only=True)

View File

@@ -4,7 +4,10 @@ from typing import List, Optional, Tuple
import torch import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.attention.ops.prefix_prefill import context_attention_fwd from vllm.triton_utils import HAS_TRITON
if HAS_TRITON:
from vllm.attention.ops.prefix_prefill import context_attention_fwd
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
_PARTITION_SIZE = 512 _PARTITION_SIZE = 512

View File

@@ -808,8 +808,6 @@ if triton.__version__ >= "2.1.0":
) )
return return
import time
ts_beg = time.time()
_fwd_kernel[grid]( _fwd_kernel[grid](
q, q,
k, k,
@@ -860,6 +858,4 @@ if triton.__version__ >= "2.1.0":
num_warps=NUM_WARPS, num_warps=NUM_WARPS,
num_stages=1, num_stages=1,
) )
elapsed = time.time() - ts_beg
#print(f'{elapsed}: {BLOCK=}, {Lk=}, {Lk_padded=}, {BLOCK=}, {sliding_window=}, {NUM_WARPS=}')
return return

View File

@@ -204,9 +204,6 @@ def which_attn_to_use(
if selected_backend != _Backend.PALLAS: if selected_backend != _Backend.PALLAS:
logger.info("Cannot use %s backend on TPU.", selected_backend) logger.info("Cannot use %s backend on TPU.", selected_backend)
return _Backend.PALLAS return _Backend.PALLAS
if selected_backend == _Backend.FLASH_ATTN:
print("selected_backend == _Backend.FLASH_ATTN")
if is_hip(): if is_hip():
# AMD GPUs. # AMD GPUs.

View File

@@ -217,7 +217,8 @@ class MessageQueue:
remote_subscribe_port = get_open_port() remote_subscribe_port = get_open_port()
if is_valid_ipv6_address(connect_ip): if is_valid_ipv6_address(connect_ip):
self.remote_socket.setsockopt(IPV6, 1) self.remote_socket.setsockopt(IPV6, 1)
socket_addr = f"tcp://*:{remote_subscribe_port}" connect_ip = f"[{connect_ip}]"
socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}"
self.remote_socket.bind(socket_addr) self.remote_socket.bind(socket_addr)
else: else:

Some files were not shown because too many files have changed in this diff. Show More