example: add vlm to token in & out example (#3941)

Co-authored-by: zhaochenyang20 <zhaochen20@outlook.com>
Author: Mick
Date: 2025-03-05 14:18:26 +08:00
Committed by: GitHub
parent e074d84e5b
commit 583d6af71b
9 changed files with 154 additions and 29 deletions

View File

@@ -40,7 +40,7 @@ class ModelConfig:
trust_remote_code: bool = True,
revision: Optional[str] = None,
context_length: Optional[int] = None,
-model_override_args: Optional[dict] = None,
+model_override_args: Optional[str] = None,
is_embedding: Optional[bool] = None,
dtype: str = "auto",
quantization: Optional[str] = None,
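Taking the hunk at face value, the annotation flips from Optional[dict] to Optional[str]: model overrides now travel as a JSON string rather than a pre-parsed dict. A minimal sketch of the round trip (the override key below is illustrative, not a confirmed schema):

import json

# Hypothetical caller side: serialize the overrides to a JSON string,
# matching the new Optional[str] annotation.
model_override_args = json.dumps({"num_hidden_layers": 4})

# Hypothetical consumer side: the config is then expected to parse the
# string back into a dict before applying it.
overrides = json.loads(model_override_args)
assert overrides["num_hidden_layers"] == 4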

View File

@@ -42,7 +42,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessorOutput
from sglang.srt.managers.io_struct import (
AbortReq,
BatchEmbeddingOut,
BatchMultimodalDecodeReq,
BatchTokenIDOut,
CloseSessionReqInput,
FlushCacheReq,
@@ -104,7 +103,6 @@ from sglang.srt.utils import (
crash_on_warnings,
get_bool_env_var,
get_zmq_socket,
kill_itself_when_parent_died,
pyspy_dump_schedulers,
set_gpu_proc_affinity,
set_random_seed,
@@ -1199,7 +1197,6 @@ class Scheduler:
self.spec_num_total_forward_ct += batch.batch_size()
self.num_generated_tokens += num_accepted_tokens
batch.output_ids = next_token_ids
# These 2 values are needed for processing the output, but they can be
# modified by the overlap schedule, so we copy them here to make sure
# output processing uses the correct values.
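The comment above carries the reasoning of this hunk: a later, overlapped step mutates the batch in place, so output processing must work from eager copies. An illustrative sketch in plain Python (not sglang's actual data structures):

# Illustrative only: copy before the overlapped step runs.
batch = {"output_ids": [11, 42, 7]}
saved_output_ids = list(batch["output_ids"])  # eager copy for output processing
batch["output_ids"].append(99)                # the overlap schedule mutates the batch
assert saved_output_ids == [11, 42, 7]        # the copy still holds the correct values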
@@ -1480,7 +1477,6 @@ class Scheduler:
batch.next_batch_sampling_info.update_regex_vocab_mask()
self.current_stream.synchronize()
batch.next_batch_sampling_info.sampling_info_done.set()
self.stream_output(batch.reqs, batch.return_logprob)
self.token_to_kv_pool.free_group_end()
@@ -1580,11 +1576,11 @@ class Scheduler:
if req.top_logprobs_num > 0:
req.input_top_logprobs_val = [None]
req.input_top_logprobs_idx = [None]
-
+assert len(req.temp_input_token_ids_logprobs_val) == len(
+    req.temp_input_token_ids_logprobs_idx
+)
for val, idx in zip(
-    req.temp_input_top_logprobs_val,
-    req.temp_input_top_logprobs_idx,
-    strict=True,
+    req.temp_input_top_logprobs_val, req.temp_input_top_logprobs_idx
):
req.input_top_logprobs_val.extend(val)
req.input_top_logprobs_idx.extend(idx)
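If the marker reconstruction above is right, the hunk trades zip(..., strict=True), which requires Python 3.10+, for an explicit length assert followed by a plain zip. A standalone sketch of the equivalence:

vals = [[0.1], [0.2]]
idxs = [[3], [7]]

# Pre-3.10 replacement for zip(vals, idxs, strict=True): check the lengths
# up front, since a plain zip would silently truncate to the shorter input.
assert len(vals) == len(idxs)
for val, idx in zip(vals, idxs):
    print(val, idx)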
@@ -1779,7 +1775,6 @@ class Scheduler:
if rids:
if self.model_config.is_multimodal_gen:
raise NotImplementedError()
self.send_to_detokenizer.send_pyobj(
BatchTokenIDOut(
rids,

View File

@@ -11,7 +11,7 @@ import math
import os
from abc import ABC, abstractmethod
from contextlib import contextmanager
-from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, Type, cast
+from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast
import gguf
import huggingface_hub
@@ -19,7 +19,7 @@ import numpy as np
import torch
from huggingface_hub import HfApi, hf_hub_download
from torch import nn
-from transformers import AutoModelForCausalLM, PretrainedConfig
+from transformers import AutoModelForCausalLM
from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
from sglang.srt.configs.device_config import DeviceConfig
@@ -197,7 +197,7 @@ class DefaultModelLoader(BaseModelLoader):
Returns the path to the downloaded model, or None if the model is not
downloaded from ModelScope."""
if "SGLANG_USE_MODELSCOPE" in os.environ:
if os.environ.get("SGLANG_USE_MODELSCOPE", None) == "True":
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
# pylint: disable=C.
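The rewritten condition changes behavior, not just style: the old membership test fired whenever SGLANG_USE_MODELSCOPE was set at all, even to "0" or "false", while the new test requires the literal value "True". A quick demonstration:

import os

os.environ["SGLANG_USE_MODELSCOPE"] = "false"
print("SGLANG_USE_MODELSCOPE" in os.environ)                    # True  (old check: any value counts)
print(os.environ.get("SGLANG_USE_MODELSCOPE", None) == "True")  # False (new check: exact match only)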

View File

@@ -43,10 +43,15 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Ins
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
def is_in_ci():
"""Return whether it is in CI runner."""