From 444a02441a50f87d86c406901b3726b323a3fc0f Mon Sep 17 00:00:00 2001
From: Ying Sheng
Date: Tue, 23 Jul 2024 13:49:34 -0700
Subject: [PATCH] Update vllm version to support llama3.1 (#705)

---
 python/pyproject.toml                      | 2 +-
 python/sglang/srt/hf_transformers_utils.py | 2 ++
 python/sglang/srt/models/llama2.py         | 9 +--------
 python/sglang/srt/server.py                | 1 +
 4 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 47fd3bd0b..c305200a7 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -21,7 +21,7 @@ dependencies = [
 
 [project.optional-dependencies]
 srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
-       "psutil", "pydantic", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.1", "outlines>=0.0.44"]
+       "psutil", "pydantic", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.3.post1", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py
index 850f3ffc2..1be7c32a1 100644
--- a/python/sglang/srt/hf_transformers_utils.py
+++ b/python/sglang/srt/hf_transformers_utils.py
@@ -73,6 +73,8 @@ def get_context_length(config):
     rope_scaling = getattr(config, "rope_scaling", None)
     if rope_scaling:
         rope_scaling_factor = config.rope_scaling["factor"]
+        if config.rope_scaling["rope_type"] == "llama3":
+            rope_scaling_factor = 1
     else:
         rope_scaling_factor = 1
 
diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py
index fb808abed..40bc5f8f4 100644
--- a/python/sglang/srt/models/llama2.py
+++ b/python/sglang/srt/models/llama2.py
@@ -5,14 +5,10 @@
 from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
-import tqdm
 from torch import nn
 from transformers import LlamaConfig
 from vllm.config import CacheConfig
-from vllm.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
@@ -375,9 +371,6 @@ class LlamaForCausalLM(nn.Module):
                 weight_loader(param, loaded_weight)
 
         if name is None or loaded_weight is None:
-            if get_tensor_model_parallel_rank() == 0:
-                weights = tqdm.tqdm(weights, total=int(len(params_dict) * 1.5))
-
             for name, loaded_weight in weights:
                 load_weights_per_param(name, loaded_weight)
         else:
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index b3e0aea58..e93727cc6 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -222,6 +222,7 @@ def launch_server(
         detokenizer_port=ports[2],
         nccl_ports=ports[3:],
     )
+    logger.info(f"{server_args=}")
 
     # Handle multi-node tensor parallelism
     if server_args.nnodes > 1:
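
Note on the hf_transformers_utils.py hunk: sglang derives a model's usable
context length by multiplying the rope-scaling factor into the config's
maximum-position field. Llama 3.1 ships rope_scaling with rope_type ==
"llama3", and its max_position_embeddings (131072) already describes the
extended context, so the factor must not be applied a second time. Below is
a minimal, self-contained sketch of the post-patch logic, assuming a
HuggingFace-style config object; get_context_length_sketch and the single
max_position_embeddings fallback are illustrative, not the exact sglang
implementation:

    def get_context_length_sketch(config) -> int:
        """Context length implied by a HuggingFace-style model config."""
        rope_scaling = getattr(config, "rope_scaling", None)
        if rope_scaling:
            rope_scaling_factor = rope_scaling["factor"]
            # Llama 3.1 reports rope_type == "llama3"; its config already
            # carries the scaled context length, so don't multiply twice.
            if rope_scaling.get("rope_type") == "llama3":
                rope_scaling_factor = 1
        else:
            rope_scaling_factor = 1
        return int(rope_scaling_factor * getattr(config, "max_position_embeddings", 2048))

For a Llama 3.1 config (factor 8.0, max_position_embeddings 131072) this
yields 131072, where the pre-patch logic would have reported 8 * 131072 =
1048576. Separately, the server.py hunk relies on the f-string "=" specifier
(Python 3.8+): logger.info(f"{server_args=}") logs both the name server_args
and its repr in one expression.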