Update to new version of base image
This commit is contained in:
10
Dockerfile
10
Dockerfile
@@ -1,19 +1,15 @@
|
|||||||
FROM git.modelhub.org.cn:9443/enginex-iluvatar/bi100-3.2.1-x86-ubuntu20.04-py3.10-poc-llm-infer:20250731115755
|
FROM git.modelhub.org.cn:9443/enginex-iluvatar/bi100-3.2.3-x86-ubuntu20.04-py3.10-poc-llm-infer:v1.2.3
|
||||||
|
|
||||||
RUN pip install --no-cache-dir triton==2.1.0
|
RUN pip install --no-cache-dir triton==2.1.0
|
||||||
|
|
||||||
COPY pkgs/triton /usr/local/corex/lib64/python3/dist-packages/triton
|
COPY pkgs/triton /usr/local/corex/lib64/python3/dist-packages/triton
|
||||||
COPY pkgs/triton-2.1.0+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/triton-2.1.0+corex.4.1.2.dist-info
|
COPY pkgs/triton-2.1.0+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/triton-2.1.0+corex.4.1.2.dist-info
|
||||||
COPY pkgs/xformers-0.0.22+corex.4.1.2.dist-info /usr/local/corex/lib64/python3/dist-packages/xformers-0.0.22+corex.4.1.2.dist-info
|
|
||||||
COPY pkgs/xformers /usr/local/corex/lib64/python3/dist-packages/xformers
|
|
||||||
|
|
||||||
COPY paged_attn.py /usr/local/lib/python3.10/site-packages/vllm/attention/ops/paged_attn.py
|
COPY paged_attn.py /usr/local/corex/lib64/python3/dist-packages/vllm/attention/ops/paged_attn.py
|
||||||
COPY __init__.py /usr/local/lib/python3.10/site-packages/vllm/triton_utils/__init__.py
|
COPY __init__.py /usr/local/corex/lib64/python3/dist-packages/vllm/triton_utils/__init__.py
|
||||||
COPY prefix_prefill.py /usr/local/lib/python3.10/site-packages/vllm/attention/ops/prefix_prefill.py
|
|
||||||
|
|
||||||
RUN mkdir /workspace
|
RUN mkdir /workspace
|
||||||
WORKDIR /workspace/
|
WORKDIR /workspace/
|
||||||
|
|
||||||
COPY ./launch_service /workspace/launch_service
|
COPY ./launch_service /workspace/launch_service
|
||||||
|
|
||||||
ENTRYPOINT ["./launch_service"]
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from typing import List, Optional, Tuple
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
|
|
||||||
from vllm.attention.ops.prefix_prefill import context_attention_fwd
|
from vllm.attention.ops.prefix_prefill import context_attention_fwd
|
||||||
|
|
||||||
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
|
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1102,4 +1102,4 @@ for k, v in names_and_values.items():
|
|||||||
names_and_values_to_update[k] = hint_on_error(v)
|
names_and_values_to_update[k] = hint_on_error(v)
|
||||||
|
|
||||||
names_and_values.update(names_and_values_to_update)
|
names_and_values.update(names_and_values_to_update)
|
||||||
del names_and_values_to_update, names_and_values, v, k, fn_type
|
del names_and_values_to_update, names_and_values, v, k, fn_type
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -27,4 +27,4 @@ class ImageAsset:
|
|||||||
"""
|
"""
|
||||||
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
|
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
|
||||||
s3_prefix=VLM_IMAGES_DIR)
|
s3_prefix=VLM_IMAGES_DIR)
|
||||||
return torch.load(image_path)
|
return torch.load(image_path, weights_only=True)
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -4,7 +4,10 @@ from typing import List, Optional, Tuple
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.attention.ops.prefix_prefill import context_attention_fwd
|
from vllm.triton_utils import HAS_TRITON
|
||||||
|
|
||||||
|
if HAS_TRITON:
|
||||||
|
from vllm.attention.ops.prefix_prefill import context_attention_fwd
|
||||||
|
|
||||||
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
|
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
|
||||||
_PARTITION_SIZE = 512
|
_PARTITION_SIZE = 512
|
||||||
|
|||||||
@@ -808,8 +808,6 @@ if triton.__version__ >= "2.1.0":
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
import time
|
|
||||||
ts_beg = time.time()
|
|
||||||
_fwd_kernel[grid](
|
_fwd_kernel[grid](
|
||||||
q,
|
q,
|
||||||
k,
|
k,
|
||||||
@@ -860,6 +858,4 @@ if triton.__version__ >= "2.1.0":
|
|||||||
num_warps=NUM_WARPS,
|
num_warps=NUM_WARPS,
|
||||||
num_stages=1,
|
num_stages=1,
|
||||||
)
|
)
|
||||||
elapsed = time.time() - ts_beg
|
|
||||||
#print(f'{elapsed}: {BLOCK=}, {Lk=}, {Lk_padded=}, {BLOCK=}, {sliding_window=}, {NUM_WARPS=}')
|
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -204,9 +204,6 @@ def which_attn_to_use(
|
|||||||
if selected_backend != _Backend.PALLAS:
|
if selected_backend != _Backend.PALLAS:
|
||||||
logger.info("Cannot use %s backend on TPU.", selected_backend)
|
logger.info("Cannot use %s backend on TPU.", selected_backend)
|
||||||
return _Backend.PALLAS
|
return _Backend.PALLAS
|
||||||
|
|
||||||
if selected_backend == _Backend.FLASH_ATTN:
|
|
||||||
print("selected_backend == _Backend.FLASH_ATTN")
|
|
||||||
|
|
||||||
if is_hip():
|
if is_hip():
|
||||||
# AMD GPUs.
|
# AMD GPUs.
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -217,7 +217,8 @@ class MessageQueue:
|
|||||||
remote_subscribe_port = get_open_port()
|
remote_subscribe_port = get_open_port()
|
||||||
if is_valid_ipv6_address(connect_ip):
|
if is_valid_ipv6_address(connect_ip):
|
||||||
self.remote_socket.setsockopt(IPV6, 1)
|
self.remote_socket.setsockopt(IPV6, 1)
|
||||||
socket_addr = f"tcp://*:{remote_subscribe_port}"
|
connect_ip = f"[{connect_ip}]"
|
||||||
|
socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}"
|
||||||
self.remote_socket.bind(socket_addr)
|
self.remote_socket.bind(socket_addr)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user