Fix flashinfer (#430)
This commit is contained in:
@@ -20,7 +20,7 @@ dependencies = [
|
|||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
|
srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
|
||||||
"zmq", "vllm>=0.4.2", "interegular", "pydantic", "pillow", "outlines>=0.0.27", "flashinfer>=0.0.4", "packaging"]
|
"zmq", "vllm>=0.4.2", "interegular", "pydantic", "pillow", "outlines>=0.0.27", "packaging"]
|
||||||
openai = ["openai>=1.0", "numpy", "tiktoken"]
|
openai = ["openai>=1.0", "numpy", "tiktoken"]
|
||||||
anthropic = ["anthropic>=0.20.0", "numpy"]
|
anthropic = ["anthropic>=0.20.0", "numpy"]
|
||||||
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]
|
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]
|
||||||
|
|||||||
@@ -113,7 +113,8 @@ class ModelRpcServer:
|
|||||||
f"max_prefill_num_token={self.max_prefill_num_token}, "
|
f"max_prefill_num_token={self.max_prefill_num_token}, "
|
||||||
f"context_len={self.model_config.context_len}, "
|
f"context_len={self.model_config.context_len}, "
|
||||||
)
|
)
|
||||||
logger.info(f"server_args: {server_args.print_mode_args()}")
|
if self.tp_rank == 0:
|
||||||
|
logger.info(f"server_args: {server_args.print_mode_args()}")
|
||||||
|
|
||||||
# Init cache
|
# Init cache
|
||||||
self.tree_cache = RadixCache(disable=server_args.disable_radix_cache)
|
self.tree_cache = RadixCache(disable=server_args.disable_radix_cache)
|
||||||
|
|||||||
@@ -110,12 +110,12 @@ class InputMetadata:
|
|||||||
self.kv_last_page_len = torch.ones(
|
self.kv_last_page_len = torch.ones(
|
||||||
(self.batch_size,), dtype=torch.int32, device="cuda"
|
(self.batch_size,), dtype=torch.int32, device="cuda"
|
||||||
)
|
)
|
||||||
req_pool_indices_cpu = self.req_pool_indices.cpu().numpy()
|
req_pool_indices_cpu = self.req_pool_indices.cpu().tolist()
|
||||||
seq_lens_cpu = self.seq_lens.cpu().numpy()
|
seq_lens_cpu = self.seq_lens.tolist()
|
||||||
self.kv_indices = torch.cat(
|
self.kv_indices = torch.cat(
|
||||||
[
|
[
|
||||||
self.req_to_token_pool.req_to_token[
|
self.req_to_token_pool.req_to_token[
|
||||||
req_pool_indices_cpu[i]: seq_lens_cpu[i]
|
req_pool_indices_cpu[i], : seq_lens_cpu[i]
|
||||||
]
|
]
|
||||||
for i in range(self.batch_size)
|
for i in range(self.batch_size)
|
||||||
],
|
],
|
||||||
|
|||||||
Reference in New Issue
Block a user