From c31f084c713cb91f0fdb54306f0851aa2780fdf5 Mon Sep 17 00:00:00 2001
From: Yineng Zhang <me@zhyncs.com>
Date: Wed, 7 Aug 2024 19:15:41 +0800
Subject: [PATCH] chore: update vllm to 0.5.4 (#966)

---
 .github/workflows/e2e-test.yml        | 3 +--
 .github/workflows/unit-test.yml       | 3 +--
 README.md                             | 4 ++--
 docker/Dockerfile                     | 2 +-
 python/pyproject.toml                 | 2 +-
 python/sglang/check_env.py            | 1 +
 test/srt/models/test_causal_models.py | 4 +---
 test/srt/run_suite.py                 | 2 +-
 test/srt/test_chunked_prefill.py      | 2 +-
 test/srt/test_eval_accuracy.py        | 2 +-
 test/srt/test_openai_server.py        | 2 +-
 test/srt/test_srt_endpoint.py         | 2 +-
 test/srt/test_torch_compile.py        | 2 +-
 test/srt/test_vision_openai_server.py | 2 +-
 14 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml
index 38651d45b..c8fe8acd9 100644
--- a/.github/workflows/e2e-test.yml
+++ b/.github/workflows/e2e-test.yml
@@ -34,8 +34,7 @@ jobs:
         pip cache purge
         pip install --upgrade pip
         pip install -e "python[all]"
-        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
-        pip install --upgrade transformers
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
 
     - name: Benchmark Serving Throughput
       run: |
diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
index 39e31c325..6f6fe184f 100644
--- a/.github/workflows/unit-test.yml
+++ b/.github/workflows/unit-test.yml
@@ -34,8 +34,7 @@ jobs:
         pip cache purge
         pip install --upgrade pip
         pip install -e "python[all]"
-        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
-        pip install --upgrade transformers
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
         pip install accelerate
 
     - name: Test Frontend Language
diff --git a/README.md b/README.md
index 1bb6f13d0..01f0a15c2 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 2: From source
@@ -62,7 +62,7 @@ pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
diff --git a/docker/Dockerfile b/docker/Dockerfile
index ee76b084f..9571d71a9 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -29,6 +29,6 @@ RUN pip3 --no-cache-dir install --upgrade pip \
     && git clone --depth=1 https://github.com/sgl-project/sglang.git \
     && cd sglang \
     && pip --no-cache-dir install -e "python[all]" \
-    && pip3 --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+    && pip3 --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 
 ENV DEBIAN_FRONTEND=interactive
diff --git a/python/pyproject.toml b/python/pyproject.toml
index fa444ea98..67b54e531 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -23,7 +23,7 @@ dependencies = [
 srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
        "packaging", "pillow", "psutil", "pydantic", "python-multipart",
        "torch", "uvicorn", "uvloop", "zmq",
-       "vllm==0.5.3.post1", "outlines>=0.0.44"]
+       "vllm==0.5.4", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
diff --git a/python/sglang/check_env.py b/python/sglang/check_env.py
index ca71ac788..cc8ba10e0 100644
--- a/python/sglang/check_env.py
+++ b/python/sglang/check_env.py
@@ -14,6 +14,7 @@ PACKAGE_LIST = [
     "sglang",
     "flashinfer",
     "triton",
+    "transformers",
     "requests",
     "tqdm",
     "numpy",
diff --git a/test/srt/models/test_causal_models.py b/test/srt/models/test_causal_models.py
index 0522816b3..4aeaadb99 100644
--- a/test/srt/models/test_causal_models.py
+++ b/test/srt/models/test_causal_models.py
@@ -18,9 +18,7 @@ import torch
 from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
 
 MODELS = [
-    # (model_name, tp_size)
     ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1),
-    # ("meta-llama/Meta-Llama-3.1-8B-Instruct", 2),
 ]
 TORCH_DTYPES = [torch.float16]
 
@@ -51,7 +49,7 @@ class TestCausalModels(unittest.TestCase):
             hf_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i])
             srt_logprobs = torch.Tensor(srt_outputs.top_input_logprobs[i])
 
-            tolerance = 2e-2
+            tolerance = 3e-2
             assert torch.all(
                 abs(hf_logprobs - srt_logprobs) < tolerance
             ), f"prefill logprobs not all close"
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 38af8aabd..f993b7e8b 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -20,7 +20,7 @@ if __name__ == "__main__":
     arg_parser.add_argument(
         "--timeout-per-file",
         type=int,
-        default=1000,
+        default=2000,
         help="The time limit for running one file in seconds.",
     )
     arg_parser.add_argument(
diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py
index 797c3e5cc..7f274926a 100644
--- a/test/srt/test_chunked_prefill.py
+++ b/test/srt/test_chunked_prefill.py
@@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = f"http://localhost:8157"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.process = popen_launch_server(
             cls.model,
             cls.base_url,
diff --git a/test/srt/test_eval_accuracy.py b/test/srt/test_eval_accuracy.py
index 0f7ef20b0..b63593626 100644
--- a/test/srt/test_eval_accuracy.py
+++ b/test/srt/test_eval_accuracy.py
@@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = f"http://localhost:8157"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
 
     @classmethod
diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py
index b0df6738f..f86dc0650 100644
--- a/test/srt/test_openai_server.py
+++ b/test/srt/test_openai_server.py
@@ -14,7 +14,7 @@ class TestOpenAIServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = f"http://localhost:8157"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, api_key=cls.api_key
diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py
index c8db402d8..b208dfa13 100644
--- a/test/srt/test_srt_endpoint.py
+++ b/test/srt/test_srt_endpoint.py
@@ -13,7 +13,7 @@ class TestSRTEndpoint(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = f"http://localhost:{8157}"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
 
     @classmethod
diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py
index e42be1e8a..fd2c6ebb7 100644
--- a/test/srt/test_torch_compile.py
+++ b/test/srt/test_torch_compile.py
@@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = f"http://localhost:8157"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
         )
diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py
index 0f945a5df..982c026db 100644
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -13,7 +13,7 @@ class TestOpenAIVisionServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
-        cls.base_url = "http://localhost:8157"
+        cls.base_url = "http://127.0.0.1:8157"
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model,