Fix a bug in BatchTokenIDOut & Misc style and dependency updates (#7457)
This commit is contained in:
@@ -29,6 +29,7 @@ runtime_common = [
|
||||
"msgspec",
|
||||
"ninja",
|
||||
"orjson",
|
||||
"outlines==0.1.11",
|
||||
"packaging",
|
||||
"partial_json_parser",
|
||||
"pillow",
|
||||
@@ -50,13 +51,12 @@ runtime_common = [
|
||||
srt = [
|
||||
"sglang[runtime_common]",
|
||||
"sgl-kernel==0.1.9",
|
||||
"flashinfer_python==0.2.6.post1",
|
||||
"torch==2.7.1",
|
||||
"torchaudio==2.7.1",
|
||||
"torchvision==0.22.1",
|
||||
"cuda-python",
|
||||
"outlines>=0.0.44,<=0.1.11",
|
||||
"einops",
|
||||
"flashinfer_python==0.2.6.post1",
|
||||
]
|
||||
|
||||
blackwell = [
|
||||
@@ -66,7 +66,6 @@ blackwell = [
|
||||
"torchaudio==2.7.1",
|
||||
"torchvision==0.22.1",
|
||||
"cuda-python",
|
||||
"outlines>=0.0.44,<=0.1.11",
|
||||
"einops",
|
||||
"flashinfer_python==0.2.6.post1",
|
||||
]
|
||||
@@ -77,23 +76,22 @@ srt_hip = [
|
||||
"sglang[runtime_common]",
|
||||
"torch",
|
||||
"vllm==0.6.7.dev2",
|
||||
"outlines==0.1.11"
|
||||
]
|
||||
|
||||
# xpu is not enabled in public vllm and torch whl,
|
||||
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
|
||||
srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
||||
srt_xpu = ["sglang[runtime_common]"]
|
||||
|
||||
# For Intel Gaudi(device : hpu) follow the installation guide
|
||||
# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
|
||||
srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
||||
srt_hpu = ["sglang[runtime_common]"]
|
||||
|
||||
# CPU: currently, there are no pre-built vllm wheels for CPU.
|
||||
# To install vllm for CPU, please follow the instructions here:
|
||||
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
|
||||
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
|
||||
srt_cpu = ["sglang[runtime_common]", "einops"]
|
||||
# https://vllm-ascend.readthedocs.io/en/latest/installation.html
|
||||
srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
|
||||
srt_npu = ["sglang[runtime_common]"]
|
||||
|
||||
openai = ["openai>=1.0", "tiktoken"]
|
||||
anthropic = ["anthropic>=0.20.0"]
|
||||
|
||||
@@ -788,6 +788,7 @@ class Req:
|
||||
self.multimodal_inputs = None
|
||||
self.grammar = None
|
||||
self.origin_input_ids = [0] # set it to one token to skip the long prefill
|
||||
self.return_logprob = False
|
||||
self.finished_reason = FINISH_ABORT(
|
||||
error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
|
||||
)
|
||||
|
||||
@@ -1374,7 +1374,14 @@ class Scheduler(
|
||||
)
|
||||
raise ValueError(msg)
|
||||
|
||||
if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size:
|
||||
if self.disaggregation_mode == DisaggregationMode.DECODE:
|
||||
req_total_size = (
|
||||
self.req_to_token_pool.size + self.req_to_token_pool.pre_alloc_size
|
||||
)
|
||||
else:
|
||||
req_total_size = self.req_to_token_pool.size
|
||||
|
||||
if len(self.req_to_token_pool.free_slots) != req_total_size:
|
||||
msg = (
|
||||
"req_to_token_pool memory leak detected!"
|
||||
f"available_size={len(self.req_to_token_pool.free_slots)}, "
|
||||
|
||||
@@ -1226,7 +1226,7 @@ class TokenizerManager:
|
||||
state.last_output_offset = len(state.output_ids)
|
||||
else:
|
||||
state.output_ids.extend(recv_obj.output_ids[i])
|
||||
output_token_ids = state.output_ids
|
||||
output_token_ids = state.output_ids.copy()
|
||||
|
||||
out_dict = {
|
||||
"output_ids": output_token_ids,
|
||||
|
||||
@@ -1723,9 +1723,8 @@ class PortArgs:
|
||||
dist_init_host, dist_init_port = dist_init_addr
|
||||
port_base = int(dist_init_port) + 1
|
||||
if dp_rank is None:
|
||||
scheduler_input_port = (
|
||||
port_base + 3
|
||||
) # TokenizerManager to DataParallelController
|
||||
# TokenizerManager to DataParallelController
|
||||
scheduler_input_port = port_base + 3
|
||||
else:
|
||||
scheduler_input_port = port_base + 3 + 1 + dp_rank
|
||||
|
||||
|
||||
@@ -1917,13 +1917,6 @@ def configure_ipv6(dist_init_addr):
|
||||
return port, host
|
||||
|
||||
|
||||
def rank0_log(msg: str):
|
||||
from sglang.srt.distributed import get_tensor_model_parallel_rank
|
||||
|
||||
if get_tensor_model_parallel_rank() == 0:
|
||||
logger.info(msg)
|
||||
|
||||
|
||||
def rank0_print(msg: str):
|
||||
from sglang.srt.distributed import get_tensor_model_parallel_rank
|
||||
|
||||
@@ -1931,6 +1924,9 @@ def rank0_print(msg: str):
|
||||
print(msg, flush=True)
|
||||
|
||||
|
||||
rank0_log = rank0_print
|
||||
|
||||
|
||||
def get_cuda_version():
|
||||
if torch.version.cuda:
|
||||
return tuple(map(int, torch.version.cuda.split(".")))
|
||||
|
||||
Reference in New Issue
Block a user