chore: update vllm to 0.5.4 (#966)
This commit is contained in:
3
.github/workflows/e2e-test.yml
vendored
3
.github/workflows/e2e-test.yml
vendored
@@ -34,8 +34,7 @@ jobs:
|
|||||||
pip cache purge
|
pip cache purge
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install -e "python[all]"
|
pip install -e "python[all]"
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
|
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
||||||
pip install --upgrade transformers
|
|
||||||
|
|
||||||
- name: Benchmark Serving Throughput
|
- name: Benchmark Serving Throughput
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
3
.github/workflows/unit-test.yml
vendored
3
.github/workflows/unit-test.yml
vendored
@@ -34,8 +34,7 @@ jobs:
|
|||||||
pip cache purge
|
pip cache purge
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install -e "python[all]"
|
pip install -e "python[all]"
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall
|
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
||||||
pip install --upgrade transformers
|
|
||||||
pip install accelerate
|
pip install accelerate
|
||||||
|
|
||||||
- name: Test Frontend Language
|
- name: Test Frontend Language
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ pip install --upgrade pip
|
|||||||
pip install "sglang[all]"
|
pip install "sglang[all]"
|
||||||
|
|
||||||
# Install FlashInfer CUDA kernels
|
# Install FlashInfer CUDA kernels
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
|
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
||||||
```
|
```
|
||||||
|
|
||||||
### Method 2: From source
|
### Method 2: From source
|
||||||
@@ -62,7 +62,7 @@ pip install --upgrade pip
|
|||||||
pip install -e "python[all]"
|
pip install -e "python[all]"
|
||||||
|
|
||||||
# Install FlashInfer CUDA kernels
|
# Install FlashInfer CUDA kernels
|
||||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
|
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
||||||
```
|
```
|
||||||
|
|
||||||
### Method 3: Using docker
|
### Method 3: Using docker
|
||||||
|
|||||||
@@ -29,6 +29,6 @@ RUN pip3 --no-cache-dir install --upgrade pip \
|
|||||||
&& git clone --depth=1 https://github.com/sgl-project/sglang.git \
|
&& git clone --depth=1 https://github.com/sgl-project/sglang.git \
|
||||||
&& cd sglang \
|
&& cd sglang \
|
||||||
&& pip --no-cache-dir install -e "python[all]" \
|
&& pip --no-cache-dir install -e "python[all]" \
|
||||||
&& pip3 --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
|
&& pip3 --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=interactive
|
ENV DEBIAN_FRONTEND=interactive
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ dependencies = [
|
|||||||
srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
|
srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
|
||||||
"packaging", "pillow", "psutil", "pydantic", "python-multipart",
|
"packaging", "pillow", "psutil", "pydantic", "python-multipart",
|
||||||
"torch", "uvicorn", "uvloop", "zmq",
|
"torch", "uvicorn", "uvloop", "zmq",
|
||||||
"vllm==0.5.3.post1", "outlines>=0.0.44"]
|
"vllm==0.5.4", "outlines>=0.0.44"]
|
||||||
openai = ["openai>=1.0", "tiktoken"]
|
openai = ["openai>=1.0", "tiktoken"]
|
||||||
anthropic = ["anthropic>=0.20.0"]
|
anthropic = ["anthropic>=0.20.0"]
|
||||||
litellm = ["litellm>=1.0.0"]
|
litellm = ["litellm>=1.0.0"]
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ PACKAGE_LIST = [
|
|||||||
"sglang",
|
"sglang",
|
||||||
"flashinfer",
|
"flashinfer",
|
||||||
"triton",
|
"triton",
|
||||||
|
"transformers",
|
||||||
"requests",
|
"requests",
|
||||||
"tqdm",
|
"tqdm",
|
||||||
"numpy",
|
"numpy",
|
||||||
|
|||||||
@@ -18,9 +18,7 @@ import torch
|
|||||||
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
|
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
# (model_name, tp_size)
|
|
||||||
("meta-llama/Meta-Llama-3.1-8B-Instruct", 1),
|
("meta-llama/Meta-Llama-3.1-8B-Instruct", 1),
|
||||||
# ("meta-llama/Meta-Llama-3.1-8B-Instruct", 2),
|
|
||||||
]
|
]
|
||||||
TORCH_DTYPES = [torch.float16]
|
TORCH_DTYPES = [torch.float16]
|
||||||
|
|
||||||
@@ -51,7 +49,7 @@ class TestCausalModels(unittest.TestCase):
|
|||||||
hf_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i])
|
hf_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i])
|
||||||
srt_logprobs = torch.Tensor(srt_outputs.top_input_logprobs[i])
|
srt_logprobs = torch.Tensor(srt_outputs.top_input_logprobs[i])
|
||||||
|
|
||||||
tolerance = 2e-2
|
tolerance = 3e-2
|
||||||
assert torch.all(
|
assert torch.all(
|
||||||
abs(hf_logprobs - srt_logprobs) < tolerance
|
abs(hf_logprobs - srt_logprobs) < tolerance
|
||||||
), f"prefill logprobs not all close"
|
), f"prefill logprobs not all close"
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ if __name__ == "__main__":
|
|||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
"--timeout-per-file",
|
"--timeout-per-file",
|
||||||
type=int,
|
type=int,
|
||||||
default=1000,
|
default=2000,
|
||||||
help="The time limit for running one file in seconds.",
|
help="The time limit for running one file in seconds.",
|
||||||
)
|
)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
||||||
cls.base_url = f"http://localhost:8157"
|
cls.base_url = "http://127.0.0.1:8157"
|
||||||
cls.process = popen_launch_server(
|
cls.process = popen_launch_server(
|
||||||
cls.model,
|
cls.model,
|
||||||
cls.base_url,
|
cls.base_url,
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
||||||
cls.base_url = f"http://localhost:8157"
|
cls.base_url = "http://127.0.0.1:8157"
|
||||||
cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
|
cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ class TestOpenAIServer(unittest.TestCase):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
||||||
cls.base_url = f"http://localhost:8157"
|
cls.base_url = "http://127.0.0.1:8157"
|
||||||
cls.api_key = "sk-123456"
|
cls.api_key = "sk-123456"
|
||||||
cls.process = popen_launch_server(
|
cls.process = popen_launch_server(
|
||||||
cls.model, cls.base_url, timeout=300, api_key=cls.api_key
|
cls.model, cls.base_url, timeout=300, api_key=cls.api_key
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ class TestSRTEndpoint(unittest.TestCase):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
||||||
cls.base_url = f"http://localhost:{8157}"
|
cls.base_url = "http://127.0.0.1:8157"
|
||||||
cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
|
cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ class TestAccuracy(unittest.TestCase):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
|
||||||
cls.base_url = f"http://localhost:8157"
|
cls.base_url = "http://127.0.0.1:8157"
|
||||||
cls.process = popen_launch_server(
|
cls.process = popen_launch_server(
|
||||||
cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
|
cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ class TestOpenAIVisionServer(unittest.TestCase):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
|
cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
|
||||||
cls.base_url = "http://localhost:8157"
|
cls.base_url = "http://127.0.0.1:8157"
|
||||||
cls.api_key = "sk-123456"
|
cls.api_key = "sk-123456"
|
||||||
cls.process = popen_launch_server(
|
cls.process = popen_launch_server(
|
||||||
cls.model,
|
cls.model,
|
||||||
|
|||||||
Reference in New Issue
Block a user