From c31f084c713cb91f0fdb54306f0851aa2780fdf5 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 7 Aug 2024 19:15:41 +0800 Subject: [PATCH] chore: update vllm to 0.5.4 (#966) --- .github/workflows/e2e-test.yml | 3 +-- .github/workflows/unit-test.yml | 3 +-- README.md | 4 ++-- docker/Dockerfile | 2 +- python/pyproject.toml | 2 +- python/sglang/check_env.py | 1 + test/srt/models/test_causal_models.py | 4 +--- test/srt/run_suite.py | 2 +- test/srt/test_chunked_prefill.py | 2 +- test/srt/test_eval_accuracy.py | 2 +- test/srt/test_openai_server.py | 2 +- test/srt/test_srt_endpoint.py | 2 +- test/srt/test_torch_compile.py | 2 +- test/srt/test_vision_openai_server.py | 2 +- 14 files changed, 15 insertions(+), 18 deletions(-) diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 38651d45b..c8fe8acd9 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -34,8 +34,7 @@ jobs: pip cache purge pip install --upgrade pip pip install -e "python[all]" - pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall - pip install --upgrade transformers + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - name: Benchmark Serving Throughput run: | diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 39e31c325..6f6fe184f 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -34,8 +34,7 @@ jobs: pip cache purge pip install --upgrade pip pip install -e "python[all]" - pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall - pip install --upgrade transformers + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall pip install accelerate - name: Test Frontend Language diff --git a/README.md b/README.md index 1bb6f13d0..01f0a15c2 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ pip install --upgrade pip pip install "sglang[all]" # Install FlashInfer CUDA kernels -pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ +pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ ``` ### Method 2: From source @@ -62,7 +62,7 @@ pip install --upgrade pip pip install -e "python[all]" # Install FlashInfer CUDA kernels -pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ +pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ ``` ### Method 3: Using docker diff --git a/docker/Dockerfile b/docker/Dockerfile index ee76b084f..9571d71a9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -29,6 +29,6 @@ RUN pip3 --no-cache-dir install --upgrade pip \ && git clone --depth=1 https://github.com/sgl-project/sglang.git \ && cd sglang \ && pip --no-cache-dir install -e "python[all]" \ - && pip3 --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ + && pip3 --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ ENV DEBIAN_FRONTEND=interactive diff --git a/python/pyproject.toml b/python/pyproject.toml index fa444ea98..67b54e531 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow", "psutil", "pydantic", "python-multipart", "torch", "uvicorn", "uvloop", "zmq", - "vllm==0.5.3.post1", "outlines>=0.0.44"] + "vllm==0.5.4", "outlines>=0.0.44"] openai = ["openai>=1.0", "tiktoken"] anthropic = ["anthropic>=0.20.0"] litellm = ["litellm>=1.0.0"] diff --git a/python/sglang/check_env.py b/python/sglang/check_env.py index ca71ac788..cc8ba10e0 100644 --- a/python/sglang/check_env.py +++ b/python/sglang/check_env.py @@ -14,6 +14,7 @@ PACKAGE_LIST = [ "sglang", "flashinfer", "triton", + "transformers", "requests", "tqdm", "numpy", diff --git a/test/srt/models/test_causal_models.py b/test/srt/models/test_causal_models.py index 0522816b3..4aeaadb99 100644 --- a/test/srt/models/test_causal_models.py +++ b/test/srt/models/test_causal_models.py @@ -18,9 +18,7 @@ import torch from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner MODELS = [ - # (model_name, tp_size) ("meta-llama/Meta-Llama-3.1-8B-Instruct", 1), - # ("meta-llama/Meta-Llama-3.1-8B-Instruct", 2), ] TORCH_DTYPES = [torch.float16] @@ -51,7 +49,7 @@ class TestCausalModels(unittest.TestCase): hf_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i]) srt_logprobs = torch.Tensor(srt_outputs.top_input_logprobs[i]) - tolerance = 2e-2 + tolerance = 3e-2 assert torch.all( abs(hf_logprobs - srt_logprobs) < tolerance ), f"prefill logprobs not all close" diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 38af8aabd..f993b7e8b 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -20,7 +20,7 @@ if __name__ == "__main__": arg_parser.add_argument( "--timeout-per-file", type=int, - default=1000, + default=2000, help="The time limit for running one file in seconds.", ) arg_parser.add_argument( diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py index 797c3e5cc..7f274926a 100644 --- a/test/srt/test_chunked_prefill.py +++ b/test/srt/test_chunked_prefill.py @@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = f"http://localhost:8157" + cls.base_url = "http://127.0.0.1:8157" cls.process = popen_launch_server( cls.model, cls.base_url, diff --git a/test/srt/test_eval_accuracy.py b/test/srt/test_eval_accuracy.py index 0f7ef20b0..b63593626 100644 --- a/test/srt/test_eval_accuracy.py +++ b/test/srt/test_eval_accuracy.py @@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = f"http://localhost:8157" + cls.base_url = "http://127.0.0.1:8157" cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) @classmethod diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index b0df6738f..f86dc0650 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -14,7 +14,7 @@ class TestOpenAIServer(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = f"http://localhost:8157" + cls.base_url = "http://127.0.0.1:8157" cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, api_key=cls.api_key diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py index c8db402d8..b208dfa13 100644 --- a/test/srt/test_srt_endpoint.py +++ b/test/srt/test_srt_endpoint.py @@ -13,7 +13,7 @@ class TestSRTEndpoint(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = f"http://localhost:{8157}" + cls.base_url = "http://127.0.0.1:8157" cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) @classmethod diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index e42be1e8a..fd2c6ebb7 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = f"http://localhost:8157" + cls.base_url = "http://127.0.0.1:8157" cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"] ) diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 0f945a5df..982c026db 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -13,7 +13,7 @@ class TestOpenAIVisionServer(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = "liuhaotian/llava-v1.6-vicuna-7b" - cls.base_url = "http://localhost:8157" + cls.base_url = "http://127.0.0.1:8157" cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model,