Fix the chat template for llava-v1.6-34b & format code (#177)

Lianmin Zheng
2024-02-11 05:50:13 -08:00
committed by GitHub
parent 50afed4eaa
commit c51020cf0c
23 changed files with 101 additions and 44 deletions

View File

@@ -1,6 +1,7 @@
 """
 Backend configurations, may vary with different serving platforms.
 """
+
 from dataclasses import dataclass

View File

@@ -366,7 +366,8 @@ def generate_chat_conv(
                     if content.type == "text":
                         real_content += content.text
                     elif content.type == "image_url":
-                        real_content += "<image>"
+                        # NOTE: Only works for llava
+                        real_content += "<image>\n"
                         conv.append_image(content.image_url.url)
                 conv.append_message(conv.roles[0], real_content)
         elif msg_role == "assistant":
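For reference, a minimal sketch of the flattening this hunk changes, using plain dicts in place of the pydantic request objects (not the sglang source; the trailing newline after "<image>" is what this commit adds for llava-v1.6-34b):

def flatten_content(parts):
    """Concatenate text parts; each image part becomes an '<image>' placeholder plus a newline."""
    real_content, image_urls = "", []
    for part in parts:
        if part["type"] == "text":
            real_content += part["text"]
        elif part["type"] == "image_url":
            # NOTE: Only works for llava; the "\n" is the fix this commit adds.
            real_content += "<image>\n"
            image_urls.append(part["image_url"]["url"])
    return real_content, image_urls

# One image followed by a question:
text, urls = flatten_content([
    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    {"type": "text", "text": "What is in this picture?"},
])
assert text == "<image>\nWhat is in this picture?"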

View File

@@ -31,6 +31,7 @@ from sglang.srt.utils import (
     is_multimodal_model,
     set_random_seed,
 )
+from vllm.logger import _default_handler as vllm_default_handler
 
 logger = logging.getLogger("model_rpc")
@@ -50,6 +51,9 @@ class ModelRpcServer(rpyc.Service):
         self.tp_size = server_args.tp_size
         self.schedule_heuristic = server_args.schedule_heuristic
         self.disable_regex_jump_forward = server_args.disable_regex_jump_forward
+        vllm_default_handler.setLevel(
+            level=getattr(logging, server_args.log_level.upper())
+        )
 
         # Init model and tokenizer
         self.model_config = ModelConfig(
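The three added lines propagate the server's log level to vLLM's default log handler. A self-contained sketch of that pattern (the logger name below is a placeholder, not vLLM's):

import logging

def apply_log_level(handler_or_logger, log_level: str) -> None:
    # Map a lower-case level string such as "warning" to logging.WARNING.
    handler_or_logger.setLevel(getattr(logging, log_level.upper()))

noisy = logging.getLogger("some.vendor.library")  # placeholder name
apply_log_level(noisy, "warning")
assert noisy.level == logging.WARNING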
@@ -83,9 +87,11 @@ class ModelRpcServer(rpyc.Service):
         self.max_num_running_seq = self.max_total_num_token // 2
         self.max_prefill_num_token = max(
             self.model_config.context_len,
-            self.max_total_num_token // 6
-            if server_args.max_prefill_num_token is None
-            else server_args.max_prefill_num_token,
+            (
+                self.max_total_num_token // 6
+                if server_args.max_prefill_num_token is None
+                else server_args.max_prefill_num_token
+            ),
         )
         self.int_token_logit_bias = torch.tensor(
             get_int_token_logit_bias(self.tokenizer, self.model_config.vocab_size)
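The reformatted expression keeps the same default: use server_args.max_prefill_num_token when given, otherwise budget one sixth of the total KV token capacity, floored at the model's context length. A small illustration with placeholder numbers (not the sglang API):

def pick_max_prefill_num_token(context_len, max_total_num_token, override=None):
    # Explicit override wins; otherwise ~1/6 of the KV capacity, but never
    # less than the model's context length.
    return max(
        context_len,
        max_total_num_token // 6 if override is None else override,
    )

print(pick_max_prefill_num_token(4096, 120_000))        # 20000
print(pick_max_prefill_num_token(4096, 120_000, 8192))  # 8192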
@@ -534,7 +540,7 @@ class ModelRpcServer(rpyc.Service):
             output_skip_special_tokens.append(
                 req.sampling_params.skip_special_tokens
             )
-            # For the length of input_ids, which will be accumulated during jump-forward.
+            # Use the original length of input_ids to calculate the token usage info.
             meta_info = {

View File

@@ -112,7 +112,9 @@ class InputMetadata:
             (self.batch_size,), dtype=torch.int32, device="cuda"
         )
-        workspace_buffer = torch.empty(32 * 1024 * 1024, dtype=torch.int8, device="cuda")
+        workspace_buffer = torch.empty(
+            32 * 1024 * 1024, dtype=torch.int8, device="cuda"
+        )
         if (
             self.forward_mode == ForwardMode.PREFILL
             or self.forward_mode == ForwardMode.EXTEND
@@ -121,7 +123,9 @@ class InputMetadata:
                 (self.batch_size + 1,), dtype=torch.int32, device="cuda"
             )
             self.qo_indptr[1:] = torch.cumsum(self.extend_seq_lens, dim=0)
-            self.prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(workspace_buffer, "NHD")
+            self.prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
+                workspace_buffer, "NHD"
+            )
             self.prefill_wrapper.begin_forward(
                 self.qo_indptr,
                 self.kv_indptr,
@@ -131,7 +135,9 @@ class InputMetadata:
                 self.model_runner.model_config.num_key_value_heads // tp_size,
             )
         else:
-            self.decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD")
+            self.decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
+                workspace_buffer, "NHD"
+            )
             self.decode_wrapper.begin_forward(
                 self.kv_indptr,
                 self.kv_indices,
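A hedged sketch of the indptr construction used by the prefill path above: qo_indptr marks where each request's query tokens start in the flattened batch, built as a prefix sum of the per-request extend lengths (toy values, CPU tensors):

import torch

extend_seq_lens = torch.tensor([3, 5, 2], dtype=torch.int32)  # example lengths
qo_indptr = torch.zeros(len(extend_seq_lens) + 1, dtype=torch.int32)
qo_indptr[1:] = torch.cumsum(extend_seq_lens, dim=0)
print(qo_indptr)  # tensor([ 0,  3,  8, 10], dtype=torch.int32)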

View File

@@ -1,4 +1,5 @@
 """Memory pool."""
+
 import logging
 
 import torch

View File

@@ -1,4 +1,5 @@
 """Inference-only LLaVa model compatible with HuggingFace weights."""
+
 from typing import List, Optional
 
 import numpy as np
@@ -269,7 +270,6 @@ class LlavaLlamaForCausalLM(nn.Module):
raise ValueError(f"Unexpected select feature: {self.select_feature}")
# load mm_projector
# TODO: support TP?
projector_weights = {
"model.mm_projector.0": "multi_modal_projector.linear_1",
"model.mm_projector.2": "multi_modal_projector.linear_2",

View File

@@ -1,4 +1,5 @@
 """Inference-only Mistral model."""
+
 from sglang.srt.models.llama2 import LlamaForCausalLM

View File

@@ -97,14 +97,16 @@ class MixtralMoE(nn.Module):
         self.experts = nn.ModuleList(
             [
-                MixtralMLP(
-                    self.num_total_experts,
-                    config.hidden_size,
-                    config.intermediate_size,
-                    linear_method=linear_method,
-                )
-                if idx in self.expert_indicies
-                else None
+                (
+                    MixtralMLP(
+                        self.num_total_experts,
+                        config.hidden_size,
+                        config.intermediate_size,
+                        linear_method=linear_method,
+                    )
+                    if idx in self.expert_indicies
+                    else None
+                )
                 for idx in range(self.num_total_experts)
             ]
         )
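The comprehension black reformatted above implements expert parallelism: a rank only instantiates the experts listed in expert_indicies and stores None for the rest. A standalone sketch of that pattern (illustrative names, not the vLLM/sglang modules):

from typing import Callable, List, Optional

def build_local_experts(
    num_total_experts: int,
    expert_indicies: List[int],
    make_expert: Callable[[], object],
) -> List[Optional[object]]:
    # Materialize only the experts owned by this rank; keep placeholders
    # (None) for experts that live on other ranks.
    return [
        (make_expert() if idx in expert_indicies else None)
        for idx in range(num_total_experts)
    ]

local = build_local_experts(8, [0, 1], make_expert=object)
assert sum(e is not None for e in local) == 2  # this rank holds 2 of 8 experts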

View File

@@ -1,4 +1,5 @@
 """Inference-only Yi-VL model."""
+
 import os
 from typing import List, Optional

View File

@@ -1,4 +1,5 @@
 """Sampling parameters for text generation."""
+
 from typing import List, Optional, Union
 
 _SAMPLING_EPS = 1e-6

View File

@@ -1,4 +1,5 @@
 """SRT: SGLang Runtime"""
+
 import asyncio
 import json
 import multiprocessing as mp
@@ -493,7 +494,7 @@ def launch_server(server_args, pipe_finish_writer):
     # Warmup
     try:
-        print("Warmup...", flush=True)
+        # print("Warmup...", flush=True)
         res = requests.post(
             url + "/generate",
             json={
@@ -505,8 +506,8 @@ def launch_server(server_args, pipe_finish_writer):
             },
             timeout=60,
         )
-        print(f"Warmup done. model response: {res.json()['text']}")
-        print("=" * 20, "Server is ready", "=" * 20, flush=True)
+        # print(f"Warmup done. model response: {res.json()['text']}")
+        # print("=" * 20, "Server is ready", "=" * 20, flush=True)
     except requests.exceptions.RequestException as e:
         if pipe_finish_writer is not None:
             pipe_finish_writer.send(str(e))
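For context, the warmup block issues one short request to the local /generate endpoint before announcing readiness. A hedged sketch of such a call (the address and payload fields are assumptions, not copied from the diff):

import requests

url = "http://127.0.0.1:30000"  # placeholder server address
res = requests.post(
    url + "/generate",
    json={
        "text": "Say this is a warmup request.",
        "sampling_params": {"temperature": 0, "max_new_tokens": 16},
    },
    timeout=60,
)
print(res.json()["text"])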

View File

@@ -122,7 +122,7 @@ def handle_port_init(
     # first check on server port
     if not check_port(port):
         new_port = alloc_usable_network_port(1, used_list=[port])[0]
-        print(f"Port {port} is not available, using {new_port} instead.")
+        print(f"WARNING: Port {port} is not available. Use {new_port} instead.")
         port = new_port
 
     # then we check on additional ports
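check_port and alloc_usable_network_port are sglang helpers; one common way to implement the availability check they rely on is to try binding the port (a hypothetical sketch, not the actual utils code):

import socket

def port_is_free(port: int, host: str = "") -> bool:
    # If bind() succeeds, nothing else is listening on the port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        try:
            s.bind((host, port))
            return True
        except OSError:
            return False

print(port_is_free(30000))  # True if nothing is listening on port 30000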
@@ -157,8 +157,6 @@ def get_int_token_logit_bias(tokenizer, vocab_size):
         ss = tokenizer.decode([t_id]).strip()
         if not (ss.isdigit() or len(ss) == 0 or t_id == tokenizer.eos_token_id):
             logit_bias[t_id] = -1e5
-        # else:
-        #     print(ss, t_id)
     return logit_bias
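get_int_token_logit_bias builds a vocabulary-sized bias vector that suppresses every token which does not decode to digits (EOS excepted), steering constrained decoding toward integer output. A toy reproduction of that idea (a four-token toy vocabulary and numpy stand in for the real tokenizer):

import numpy as np

def int_token_logit_bias(decode, vocab_size, eos_token_id):
    logit_bias = np.zeros(vocab_size, dtype=np.float32)
    for t_id in range(vocab_size):
        ss = decode(t_id).strip()
        if not (ss.isdigit() or len(ss) == 0 or t_id == eos_token_id):
            logit_bias[t_id] = -1e5  # effectively bans the token
    return logit_bias

# Toy vocabulary: "7" and "42" stay allowed, "cat" is banned, EOS is kept.
vocab = {0: "7", 1: "42", 2: "cat", 3: "</s>"}
bias = int_token_logit_bias(lambda i: vocab[i], vocab_size=4, eos_token_id=3)
assert bias[2] == -1e5 and bias[0] == 0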