Update deps for mllama4 (#5215)
This commit is contained in:
@@ -38,7 +38,7 @@ runtime_common = [
|
|||||||
"pyzmq>=25.1.2",
|
"pyzmq>=25.1.2",
|
||||||
"soundfile==0.13.1",
|
"soundfile==0.13.1",
|
||||||
"torchao>=0.7.0",
|
"torchao>=0.7.0",
|
||||||
"transformers==4.51.0",
|
"transformers==4.51.1",
|
||||||
"uvicorn",
|
"uvicorn",
|
||||||
"uvloop",
|
"uvloop",
|
||||||
"compressed-tensors",
|
"compressed-tensors",
|
||||||
@@ -50,6 +50,7 @@ srt = [
|
|||||||
"sgl-kernel==0.0.8",
|
"sgl-kernel==0.0.8",
|
||||||
"flashinfer_python==0.2.3",
|
"flashinfer_python==0.2.3",
|
||||||
"torch==2.5.1",
|
"torch==2.5.1",
|
||||||
|
"torchvision==0.20.1",
|
||||||
"cuda-python",
|
"cuda-python",
|
||||||
"outlines>=0.0.44,<=0.1.11",
|
"outlines>=0.0.44,<=0.1.11",
|
||||||
"partial_json_parser",
|
"partial_json_parser",
|
||||||
|
|||||||
@@ -840,7 +840,6 @@ class Scheduler(
|
|||||||
bootstrap_room=recv_req.bootstrap_room,
|
bootstrap_room=recv_req.bootstrap_room,
|
||||||
)
|
)
|
||||||
req.tokenizer = self.tokenizer
|
req.tokenizer = self.tokenizer
|
||||||
req.queue_time_start = time.time()
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
recv_req.session_params is not None
|
recv_req.session_params is not None
|
||||||
@@ -855,7 +854,6 @@ class Scheduler(
|
|||||||
# Create a new request from a previous session
|
# Create a new request from a previous session
|
||||||
session = self.sessions[recv_req.session_params.id]
|
session = self.sessions[recv_req.session_params.id]
|
||||||
req = session.create_req(recv_req, self.tokenizer)
|
req = session.create_req(recv_req, self.tokenizer)
|
||||||
req.queue_time_start = time.time()
|
|
||||||
if isinstance(req.finished_reason, FINISH_ABORT):
|
if isinstance(req.finished_reason, FINISH_ABORT):
|
||||||
self._add_request_to_queue(req)
|
self._add_request_to_queue(req)
|
||||||
return
|
return
|
||||||
@@ -958,6 +956,7 @@ class Scheduler(
|
|||||||
self.disagg_decode_prealloc_queue.add(req)
|
self.disagg_decode_prealloc_queue.add(req)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
req.queue_time_start = time.time()
|
||||||
self.waiting_queue.append(req)
|
self.waiting_queue.append(req)
|
||||||
|
|
||||||
def _extend_requests_to_queue(self, reqs: List[Req], is_retracted: bool = False):
|
def _extend_requests_to_queue(self, reqs: List[Req], is_retracted: bool = False):
|
||||||
|
|||||||
@@ -682,29 +682,30 @@ class TestJanusProServer(TestOpenAIVisionServer):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class TestLlama4Server(TestOpenAIVisionServer):
|
## Skip for ci test
|
||||||
@classmethod
|
# class TestLlama4Server(TestOpenAIVisionServer):
|
||||||
def setUpClass(cls):
|
# @classmethod
|
||||||
cls.model = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
# def setUpClass(cls):
|
||||||
cls.base_url = DEFAULT_URL_FOR_TEST
|
# cls.model = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||||
cls.api_key = "sk-123456"
|
# cls.base_url = DEFAULT_URL_FOR_TEST
|
||||||
cls.process = popen_launch_server(
|
# cls.api_key = "sk-123456"
|
||||||
cls.model,
|
# cls.process = popen_launch_server(
|
||||||
cls.base_url,
|
# cls.model,
|
||||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
# cls.base_url,
|
||||||
other_args=[
|
# timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||||
"--chat-template",
|
# other_args=[
|
||||||
"llama-4",
|
# "--chat-template",
|
||||||
"--mem-fraction-static",
|
# "llama-4",
|
||||||
"0.8",
|
# "--mem-fraction-static",
|
||||||
"--tp-size=8",
|
# "0.8",
|
||||||
"--context-length=8192",
|
# "--tp-size=8",
|
||||||
],
|
# "--context-length=8192",
|
||||||
)
|
# ],
|
||||||
cls.base_url += "/v1"
|
# )
|
||||||
|
# cls.base_url += "/v1"
|
||||||
|
|
||||||
def test_video_chat_completion(self):
|
# def test_video_chat_completion(self):
|
||||||
pass
|
# pass
|
||||||
|
|
||||||
|
|
||||||
class TestGemma3itServer(TestOpenAIVisionServer):
|
class TestGemma3itServer(TestOpenAIVisionServer):
|
||||||
|
|||||||
Reference in New Issue
Block a user