Fix a bug in BatchTokenIDOut & Misc style and dependency updates (#7457)
.github/workflows/pr-test.yml
@@ -113,6 +113,10 @@ jobs:
       github.event.pull_request.draft == false
     needs: [unit-test-frontend, unit-test-backend-2-gpu]
     runs-on: 8-gpu-runner
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -125,7 +129,7 @@ jobs:
         timeout-minutes: 20
         run: |
           cd test/srt
-          python3 run_suite.py --suite per-commit-8-gpu
+          python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2

   performance-test-1-gpu-part-1:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
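
The new --auto-partition-id/--auto-partition-size flags shard the 8-GPU suite across the two part matrix jobs. A minimal sketch of what a deterministic split can look like, assuming a round-robin scheme (the actual logic lives in test/srt/run_suite.py and may differ):

# Hypothetical sketch of deterministic test partitioning; the real
# implementation in test/srt/run_suite.py may split differently.
def auto_partition(tests, part_id, part_size):
    # Deal tests round-robin so each of part_size jobs gets a similar share.
    return [t for i, t in enumerate(tests) if i % part_size == part_id]

tests = ["test_a", "test_b", "test_c", "test_d", "test_e"]
assert auto_partition(tests, 0, 2) == ["test_a", "test_c", "test_e"]
assert auto_partition(tests, 1, 2) == ["test_b", "test_d"]
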
@@ -29,6 +29,7 @@ runtime_common = [
     "msgspec",
     "ninja",
     "orjson",
+    "outlines==0.1.11",
     "packaging",
     "partial_json_parser",
     "pillow",
@@ -50,13 +51,12 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
     "sgl-kernel==0.1.9",
-    "flashinfer_python==0.2.6.post1",
     "torch==2.7.1",
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
     "cuda-python",
-    "outlines>=0.0.44,<=0.1.11",
     "einops",
+    "flashinfer_python==0.2.6.post1",
 ]

 blackwell = [
@@ -66,7 +66,6 @@ blackwell = [
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
     "cuda-python",
-    "outlines>=0.0.44,<=0.1.11",
     "einops",
     "flashinfer_python==0.2.6.post1",
 ]
@@ -77,23 +76,22 @@ srt_hip = [
     "sglang[runtime_common]",
     "torch",
     "vllm==0.6.7.dev2",
-    "outlines==0.1.11"
 ]

 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
-srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_xpu = ["sglang[runtime_common]"]

 # For Intel Gaudi(device : hpu) follow the installation guide
 # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
-srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_hpu = ["sglang[runtime_common]"]

 # CPU: currently, there are no pre-built vllm wheels for CPU.
 # To install vllm for CPU, please follow the instruction here:
 # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
-srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
+srt_cpu = ["sglang[runtime_common]", "einops"]
 # https://vllm-ascend.readthedocs.io/en/latest/installation.html
-srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_npu = ["sglang[runtime_common]"]

 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
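
Net effect of the dependency hunks above: the outlines pin is consolidated into runtime_common as outlines==0.1.11, every backend extra inherits it through sglang[runtime_common], and the per-backend outlines entries (srt, blackwell, srt_hip, srt_xpu, srt_hpu, srt_cpu, srt_npu) are dropped as redundant. An illustrative way to confirm the resolved pin in an installed environment:

# Illustrative sanity check; assumes sglang (and therefore outlines)
# is installed in the current environment.
import importlib.metadata

print(importlib.metadata.version("outlines"))  # expected: 0.1.11
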
@@ -788,6 +788,7 @@ class Req:
         self.multimodal_inputs = None
         self.grammar = None
         self.origin_input_ids = [0]  # set it to one token to skip the long prefill
+        self.return_logprob = False
         self.finished_reason = FINISH_ABORT(
             error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
         )
@@ -1374,7 +1374,14 @@ class Scheduler(
             )
             raise ValueError(msg)

-        if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size:
+        if self.disaggregation_mode == DisaggregationMode.DECODE:
+            req_total_size = (
+                self.req_to_token_pool.size + self.req_to_token_pool.pre_alloc_size
+            )
+        else:
+            req_total_size = self.req_to_token_pool.size
+
+        if len(self.req_to_token_pool.free_slots) != req_total_size:
             msg = (
                 "req_to_token_pool memory leak detected!"
                 f"available_size={len(self.req_to_token_pool.free_slots)}, "
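
On the decode side of disaggregated serving, the pool apparently pre-allocates slots for requests still being handed over from prefill, so those slots are legitimately absent from free_slots and comparing against size alone raised spurious leak errors. A condensed sketch of the corrected invariant, with pool standing in for self.req_to_token_pool and decode_mode for the DisaggregationMode test:

# Condensed sketch of the corrected leak check (names are stand-ins).
def assert_no_req_pool_leak(pool, decode_mode):
    expected = pool.size + (pool.pre_alloc_size if decode_mode else 0)
    if len(pool.free_slots) != expected:
        raise ValueError(
            f"req_to_token_pool memory leak detected! "
            f"available_size={len(pool.free_slots)}, expected={expected}"
        )
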
@@ -1226,7 +1226,7 @@ class TokenizerManager:
                 state.last_output_offset = len(state.output_ids)
             else:
                 state.output_ids.extend(recv_obj.output_ids[i])
-                output_token_ids = state.output_ids
+                output_token_ids = state.output_ids.copy()

             out_dict = {
                 "output_ids": output_token_ids,
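
This is the BatchTokenIDOut bug from the commit title: without .copy(), the list stored in out_dict aliases state.output_ids, so the extend() for the next batch mutates output that was already handed back to the caller. A self-contained illustration of the pitfall:

# Why the .copy() matters: storing a list without copying keeps a live
# alias, not a snapshot.
output_ids = [1, 2]
aliased = output_ids          # what the old code effectively returned
snapshot = output_ids.copy()  # what the fixed code returns

output_ids.extend([3, 4])     # the next decode batch arrives

assert aliased == [1, 2, 3, 4]  # previously returned result was mutated
assert snapshot == [1, 2]       # the snapshot stays correct
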
@@ -1723,9 +1723,8 @@ class PortArgs:
             dist_init_host, dist_init_port = dist_init_addr
             port_base = int(dist_init_port) + 1
             if dp_rank is None:
-                scheduler_input_port = (
-                    port_base + 3
-                )  # TokenizerManager to DataParallelController
+                # TokenizerManager to DataParallelController
+                scheduler_input_port = port_base + 3
             else:
                 scheduler_input_port = port_base + 3 + 1 + dp_rank
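
The change above is purely cosmetic (the comment moves onto its own line so the wrapping parentheses can go), but the port scheme it documents is worth restating. A sketch of the layout implied by the diff:

from typing import Optional

# Port layout implied by the diff: everything derives from dist_init_port.
def scheduler_input_port(dist_init_port: int, dp_rank: Optional[int]) -> int:
    port_base = dist_init_port + 1
    if dp_rank is None:
        return port_base + 3  # TokenizerManager to DataParallelController
    # One scheduler input port per data-parallel rank, past the controller's.
    return port_base + 3 + 1 + dp_rank

assert scheduler_input_port(25000, None) == 25004
assert scheduler_input_port(25000, 2) == 25007
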
@@ -1917,13 +1917,6 @@ def configure_ipv6(dist_init_addr):
     return port, host


-def rank0_log(msg: str):
-    from sglang.srt.distributed import get_tensor_model_parallel_rank
-
-    if get_tensor_model_parallel_rank() == 0:
-        logger.info(msg)
-
-
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank

@@ -1931,6 +1924,9 @@ def rank0_print(msg: str):
         print(msg, flush=True)


+rank0_log = rank0_print
+
+
 def get_cuda_version():
     if torch.version.cuda:
         return tuple(map(int, torch.version.cuda.split(".")))
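
Note that rank0_log survives as an alias of rank0_print, so existing call sites still resolve; the behavioral difference is that messages previously routed through logger.info (with its levels and handlers) now go straight to stdout via print.
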
@@ -39,14 +39,6 @@ find_package(Torch REQUIRED)
 # clean Torch Flag
 clear_cuda_arches(CMAKE_FLAG)

-if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
-    set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
-    set(DeepGEMM_TAG "blackwell")
-else()
-    set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
-    set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
-endif()
-
 include(FetchContent)

 # cutlass
@@ -57,7 +49,16 @@ FetchContent_Declare(
     GIT_SHALLOW OFF
 )
 FetchContent_Populate(repo-cutlass)
+
 # DeepGEMM
+if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
+    set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
+    set(DeepGEMM_TAG "blackwell")
+else()
+    set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
+    set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
+endif()
+
 FetchContent_Declare(
     repo-deepgemm
     GIT_REPOSITORY ${DeepGEMM_REPO}
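
The DeepGEMM repo/tag selection itself is unchanged; it simply moves down so that the if/else sits directly above the FetchContent_Declare that consumes DeepGEMM_REPO and DeepGEMM_TAG.
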
@@ -107,7 +108,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
     message(STATUS "For aarch64, disable gencode below SM90 by default")
 endif()

-
 include_directories(
     ${PROJECT_SOURCE_DIR}/include
     ${PROJECT_SOURCE_DIR}/csrc
@@ -247,8 +247,8 @@ set(SOURCES
     "csrc/moe/ep_moe_reorder_kernel.cu"
     "csrc/moe/ep_moe_silu_and_mul_kernel.cu"
     "csrc/speculative/eagle_utils.cu"
-    "csrc/speculative/speculative_sampling.cu"
     "csrc/speculative/packbit.cu"
+    "csrc/speculative/speculative_sampling.cu"
     "csrc/grammar/apply_token_bitmask_inplace_cuda.cu"
     "csrc/common_extension.cc"
     "${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
@@ -1,7 +1,7 @@
 from typing import Optional, Union

 import torch
-from sgl_kernel.utils import _to_tensor_scalar_tuple, get_cuda_stream
+from sgl_kernel.utils import _to_tensor_scalar_tuple


 def _top_k_renorm_probs_internal(