add .isort.cfg (#378)
This commit is contained in:
@@ -3,11 +3,7 @@
|
||||
import re
|
||||
from typing import Callable, List, Optional, Union
|
||||
|
||||
from sglang.backend.anthropic import Anthropic
|
||||
from sglang.backend.base_backend import BaseBackend
|
||||
from sglang.backend.openai import OpenAI
|
||||
from sglang.backend.runtime_endpoint import RuntimeEndpoint
|
||||
from sglang.backend.vertexai import VertexAI
|
||||
from sglang.global_config import global_config
|
||||
from sglang.lang.ir import (
|
||||
SglExpr,
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from sglang.backend.base_backend import BaseBackend
|
||||
from sglang.lang.chat_template import get_chat_template
|
||||
from sglang.lang.interpreter import StreamExecutor
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import Callable, List, Optional, Union
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from sglang.lang.chat_template import get_chat_template
|
||||
from sglang.lang.interpreter import StreamExecutor
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import logging
|
||||
import time
|
||||
from typing import Callable, List, Optional, Union
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sglang.backend.base_backend import BaseBackend
|
||||
from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
|
||||
from sglang.lang.interpreter import StreamExecutor
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
import json
|
||||
from typing import Callable, List, Optional, Union
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
|
||||
from sglang.backend.base_backend import BaseBackend
|
||||
from sglang.global_config import global_config
|
||||
from sglang.lang.chat_template import get_chat_template_by_model_path
|
||||
from sglang.lang.interpreter import StreamExecutor
|
||||
from sglang.lang.ir import SglArgument, SglSamplingParams
|
||||
from sglang.utils import encode_image_base64, find_printable_text, http_request
|
||||
from sglang.lang.ir import SglSamplingParams
|
||||
from sglang.utils import find_printable_text, http_request
|
||||
|
||||
|
||||
class RuntimeEndpoint(BaseBackend):
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
import os
|
||||
import warnings
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from sglang.backend.base_backend import BaseBackend
|
||||
from sglang.lang.chat_template import get_chat_template
|
||||
from sglang.lang.interpreter import StreamExecutor
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum, auto
|
||||
from typing import Callable, Dict, List, Optional, Tuple
|
||||
from typing import Callable, Dict, List, Tuple
|
||||
|
||||
|
||||
class ChatTemplateStyle(Enum):
|
||||
|
||||
@@ -5,13 +5,7 @@ from typing import List, Union
|
||||
|
||||
from sglang.global_config import global_config
|
||||
from sglang.lang.interpreter import ProgramState, StreamExecutor, pin_program
|
||||
from sglang.lang.ir import (
|
||||
SglArgument,
|
||||
SglConstantText,
|
||||
SglExpr,
|
||||
SglSamplingParams,
|
||||
SglVariable,
|
||||
)
|
||||
from sglang.lang.ir import SglArgument, SglExpr, SglSamplingParams, SglVariable
|
||||
|
||||
|
||||
def compile_func(function, backend):
|
||||
|
||||
@@ -7,9 +7,10 @@ import threading
|
||||
import uuid
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from contextlib import contextmanager
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
import tqdm
|
||||
|
||||
from sglang.global_config import global_config
|
||||
from sglang.lang.ir import (
|
||||
SglCommitLazy,
|
||||
@@ -17,7 +18,6 @@ from sglang.lang.ir import (
|
||||
SglConstantText,
|
||||
SglExpr,
|
||||
SglExprList,
|
||||
SglFunction,
|
||||
SglGen,
|
||||
SglImage,
|
||||
SglRoleBegin,
|
||||
|
||||
@@ -472,4 +472,4 @@ class SglCommitLazy(SglExpr):
|
||||
super().__init__()
|
||||
|
||||
def __repr__(self):
|
||||
return f"CommitLazy()"
|
||||
return "CommitLazy()"
|
||||
|
||||
@@ -1,20 +1,16 @@
|
||||
"""Tracing a program."""
|
||||
|
||||
import uuid
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from sglang.backend.base_backend import BaseBackend
|
||||
from sglang.global_config import global_config
|
||||
from sglang.lang.interpreter import ProgramState, ProgramStateGroup
|
||||
from sglang.lang.ir import (
|
||||
SglArgument,
|
||||
SglCommitLazy,
|
||||
SglConcateAndAppend,
|
||||
SglConstantText,
|
||||
SglExpr,
|
||||
SglExprList,
|
||||
SglFork,
|
||||
SglFunction,
|
||||
SglGen,
|
||||
SglGetForkItem,
|
||||
SglRoleBegin,
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import interegular
|
||||
|
||||
from sglang.srt.constrained import FSMInfo, disk_cache, make_deterministic_fsm
|
||||
from sglang.srt.constrained.base_cache import BaseCache
|
||||
|
||||
|
||||
@@ -3,10 +3,9 @@
|
||||
import json
|
||||
import os
|
||||
import warnings
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from sglang.srt.utils import is_multimodal_model
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoProcessor,
|
||||
@@ -15,6 +14,8 @@ from transformers import (
|
||||
PreTrainedTokenizerFast,
|
||||
)
|
||||
|
||||
from sglang.srt.utils import is_multimodal_model
|
||||
|
||||
|
||||
def download_from_hf(model_path: str):
|
||||
if os.path.exists(model_path):
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
import torch
|
||||
import triton
|
||||
import triton.language as tl
|
||||
|
||||
from sglang.srt.utils import wrap_kernel_launcher
|
||||
|
||||
CUDA_CAPABILITY = torch.cuda.get_device_capability()
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import torch
|
||||
import triton
|
||||
import triton.language as tl
|
||||
|
||||
from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
|
||||
from sglang.srt.utils import wrap_kernel_launcher
|
||||
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
import torch
|
||||
from sglang.srt.managers.router.model_runner import ForwardMode, InputMetadata
|
||||
from torch import nn
|
||||
from vllm.model_executor.parallel_utils.communication_op import (
|
||||
get_tensor_model_parallel_world_size,
|
||||
tensor_model_parallel_all_gather,
|
||||
)
|
||||
|
||||
from sglang.srt.managers.router.model_runner import ForwardMode, InputMetadata
|
||||
|
||||
|
||||
class LogitsProcessor(nn.Module):
|
||||
def __init__(self, config):
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
|
||||
from sglang.srt.layers.extend_attention import extend_attention_fwd
|
||||
from sglang.srt.layers.token_attention import token_attention_fwd
|
||||
from sglang.srt.managers.router.model_runner import ForwardMode, InputMetadata
|
||||
from torch import nn
|
||||
|
||||
|
||||
class RadixAttention(nn.Module):
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
import torch
|
||||
import triton
|
||||
import triton.language as tl
|
||||
|
||||
from sglang.srt.managers.router.model_runner import global_server_args_dict
|
||||
from sglang.srt.utils import wrap_kernel_launcher
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import asyncio
|
||||
import uvloop
|
||||
import zmq
|
||||
import zmq.asyncio
|
||||
|
||||
from sglang.srt.hf_transformers_utils import get_tokenizer
|
||||
from sglang.srt.managers.io_struct import BatchStrOut, BatchTokenIDOut
|
||||
from sglang.srt.server_args import PortArgs, ServerArgs
|
||||
@@ -83,7 +84,7 @@ def start_detokenizer_process(
|
||||
):
|
||||
try:
|
||||
manager = DetokenizerManager(server_args, port_args)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pipe_writer.send(get_exception_traceback())
|
||||
raise
|
||||
pipe_writer.send("init ok")
|
||||
|
||||
@@ -4,6 +4,7 @@ from typing import List
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from sglang.srt.managers.router.radix_cache import RadixCache
|
||||
from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import logging
|
||||
import uvloop
|
||||
import zmq
|
||||
import zmq.asyncio
|
||||
|
||||
from sglang.srt.backend_config import GLOBAL_BACKEND_CONFIG
|
||||
from sglang.srt.managers.router.model_rpc import ModelRpcClient
|
||||
from sglang.srt.server_args import PortArgs, ServerArgs
|
||||
|
||||
@@ -10,6 +10,8 @@ import rpyc
|
||||
import torch
|
||||
from rpyc.utils.classic import obtain
|
||||
from rpyc.utils.server import ThreadedServer
|
||||
from vllm.logger import _default_handler as vllm_default_handler
|
||||
|
||||
from sglang.srt.constrained.fsm_cache import FSMCache
|
||||
from sglang.srt.constrained.jump_forward import JumpForwardCache
|
||||
from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer
|
||||
@@ -30,7 +32,6 @@ from sglang.srt.utils import (
|
||||
is_multimodal_model,
|
||||
set_random_seed,
|
||||
)
|
||||
from vllm.logger import _default_handler as vllm_default_handler
|
||||
|
||||
logger = logging.getLogger("model_rpc")
|
||||
|
||||
|
||||
@@ -9,16 +9,17 @@ from typing import List
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from sglang.srt.managers.router.infer_batch import Batch, ForwardMode
|
||||
from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
|
||||
from sglang.srt.utils import is_multimodal_model
|
||||
from sglang.utils import get_available_gpu_memory
|
||||
from vllm.model_executor.layers.quantization.awq import AWQConfig
|
||||
from vllm.model_executor.layers.quantization.gptq import GPTQConfig
|
||||
from vllm.model_executor.layers.quantization.marlin import MarlinConfig
|
||||
from vllm.model_executor.model_loader import _set_default_torch_dtype
|
||||
from vllm.model_executor.parallel_utils.parallel_state import initialize_model_parallel
|
||||
|
||||
from sglang.srt.managers.router.infer_batch import Batch, ForwardMode
|
||||
from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
|
||||
from sglang.srt.utils import is_multimodal_model
|
||||
from sglang.utils import get_available_gpu_memory
|
||||
|
||||
QUANTIZATION_CONFIG_MAPPING = {
|
||||
"awq": AWQConfig,
|
||||
"gptq": GPTQConfig,
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
import heapq
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from typing import Tuple
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ import transformers
|
||||
import uvloop
|
||||
import zmq
|
||||
import zmq.asyncio
|
||||
|
||||
from sglang.srt.hf_transformers_utils import (
|
||||
get_config,
|
||||
get_context_length,
|
||||
|
||||
@@ -20,13 +20,10 @@
|
||||
|
||||
# This file is based on the LLama model definition file in transformers
|
||||
"""PyTorch Cohere model."""
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
from torch import nn
|
||||
from torch.nn.parameter import Parameter
|
||||
from transformers import PretrainedConfig
|
||||
@@ -49,6 +46,10 @@ from vllm.model_executor.weight_utils import (
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
|
||||
|
||||
@torch.compile
|
||||
def layer_norm_func(hidden_states, weight, variance_epsilon):
|
||||
|
||||
@@ -5,10 +5,6 @@ from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
from sglang.srt.models.dbrx_config import DbrxConfig
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||
from vllm.model_executor.layers.linear import (
|
||||
LinearMethodBase,
|
||||
@@ -35,6 +31,11 @@ from vllm.model_executor.weight_utils import (
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
from sglang.srt.models.dbrx_config import DbrxConfig
|
||||
|
||||
|
||||
class DbrxRouter(nn.Module):
|
||||
"""A Router implementation for DBRX that returns logits for each expert
|
||||
|
||||
@@ -4,9 +4,6 @@
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
from torch import nn
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.config import LoRAConfig
|
||||
@@ -28,6 +25,10 @@ from vllm.model_executor.weight_utils import (
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
|
||||
|
||||
class GemmaMLP(nn.Module):
|
||||
def __init__(
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
# Adapted from
|
||||
# https://github.com/vllm-project/vllm/blob/671af2b1c0b3ed6d856d37c21a561cc429a10701/vllm/model_executor/models/llama.py#L1
|
||||
"""Inference-only LLaMA model compatible with HuggingFace weights."""
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
import torch
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
from torch import nn
|
||||
from transformers import LlamaConfig
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
@@ -30,6 +27,10 @@ from vllm.model_executor.weight_utils import (
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
|
||||
|
||||
class LlamaMLP(nn.Module):
|
||||
def __init__(
|
||||
|
||||
@@ -4,6 +4,15 @@ from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
from transformers import CLIPVisionModel, LlavaConfig
|
||||
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
|
||||
from vllm.model_executor.layers.linear import LinearMethodBase
|
||||
from vllm.model_executor.weight_utils import (
|
||||
default_weight_loader,
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
|
||||
from sglang.srt.managers.router.infer_batch import ForwardMode
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
from sglang.srt.mm_utils import (
|
||||
@@ -12,14 +21,6 @@ from sglang.srt.mm_utils import (
|
||||
unpad_image_shape,
|
||||
)
|
||||
from sglang.srt.models.llama2 import LlamaForCausalLM
|
||||
from torch import nn
|
||||
from transformers import CLIPVisionModel, LlamaConfig, LlavaConfig
|
||||
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
|
||||
from vllm.model_executor.layers.linear import LinearMethodBase
|
||||
from vllm.model_executor.weight_utils import (
|
||||
default_weight_loader,
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
|
||||
|
||||
class LlavaLlamaForCausalLM(nn.Module):
|
||||
|
||||
@@ -1,14 +1,11 @@
|
||||
# Adapted from
|
||||
# https://github.com/vllm-project/vllm/blob/d0215a58e78572d91dadafe9d832a2db89b09a13/vllm/model_executor/models/mixtral.py#L1
|
||||
"""Inference-only Mixtral model."""
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
from torch import nn
|
||||
from transformers import MixtralConfig
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
@@ -35,6 +32,10 @@ from vllm.model_executor.weight_utils import (
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
|
||||
|
||||
class MixtralMLP(nn.Module):
|
||||
def __init__(
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import torch
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
from torch import nn
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
@@ -27,6 +24,10 @@ from vllm.model_executor.weight_utils import (
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
|
||||
|
||||
class QWenMLP(nn.Module):
|
||||
def __init__(
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
# Adapted from llama2.py
|
||||
# Modify details for the adaptation of Qwen2 model.
|
||||
"""Inference-only Qwen2 model compatible with HuggingFace weights."""
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
import torch
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
from torch import nn
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
@@ -29,6 +26,10 @@ from vllm.model_executor.weight_utils import (
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
|
||||
Qwen2Config = None
|
||||
|
||||
|
||||
|
||||
@@ -5,9 +5,6 @@ model compatible with HuggingFace weights."""
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
from torch import nn
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
@@ -30,6 +27,10 @@ from vllm.model_executor.weight_utils import (
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||
from sglang.srt.layers.radix_attention import RadixAttention
|
||||
from sglang.srt.managers.router.model_runner import InputMetadata
|
||||
|
||||
|
||||
class StablelmMLP(nn.Module):
|
||||
def __init__(
|
||||
|
||||
@@ -1,21 +1,20 @@
|
||||
"""Inference-only Yi-VL model."""
|
||||
|
||||
import os
|
||||
from typing import List, Optional
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from sglang.srt.models.llava import (
|
||||
LlavaLlamaForCausalLM,
|
||||
clip_vision_embed_forward,
|
||||
monkey_path_clip_vision_embed_forward,
|
||||
)
|
||||
from transformers import CLIPVisionModel, LlavaConfig
|
||||
from vllm.model_executor.weight_utils import (
|
||||
default_weight_loader,
|
||||
hf_model_weights_iterator,
|
||||
)
|
||||
|
||||
from sglang.srt.models.llava import (
|
||||
LlavaLlamaForCausalLM,
|
||||
monkey_path_clip_vision_embed_forward,
|
||||
)
|
||||
|
||||
|
||||
class YiVLForCausalLM(LlavaLlamaForCausalLM):
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
||||
@@ -10,9 +10,6 @@ import threading
|
||||
import time
|
||||
from typing import List, Optional, Union
|
||||
|
||||
# Fix a Python bug
|
||||
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
|
||||
|
||||
import aiohttp
|
||||
import psutil
|
||||
import pydantic
|
||||
@@ -22,6 +19,9 @@ import uvloop
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import Response, StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
from starlette.responses import JSONResponse
|
||||
|
||||
from sglang.backend.runtime_endpoint import RuntimeEndpoint
|
||||
from sglang.srt.constrained import disable_cache
|
||||
from sglang.srt.conversation import (
|
||||
@@ -54,8 +54,9 @@ from sglang.srt.managers.router.manager import start_router_process
|
||||
from sglang.srt.managers.tokenizer_manager import TokenizerManager
|
||||
from sglang.srt.server_args import PortArgs, ServerArgs
|
||||
from sglang.srt.utils import enable_show_time_cost, handle_port_init
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
from starlette.responses import JSONResponse
|
||||
|
||||
# Fix a Python bug
|
||||
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
|
||||
|
||||
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
|
||||
|
||||
@@ -618,7 +619,7 @@ def launch_server(server_args, pipe_finish_writer):
|
||||
try:
|
||||
requests.get(url + "/get_model_info", timeout=5, headers=headers)
|
||||
break
|
||||
except requests.exceptions.RequestException as e:
|
||||
except requests.exceptions.RequestException:
|
||||
pass
|
||||
else:
|
||||
if pipe_finish_writer is not None:
|
||||
|
||||
@@ -157,7 +157,6 @@ def get_exception_traceback():
|
||||
|
||||
|
||||
def get_int_token_logit_bias(tokenizer, vocab_size):
|
||||
from transformers import LlamaTokenizer, LlamaTokenizerFast
|
||||
|
||||
# a bug when model's vocab size > tokenizer.vocab_size
|
||||
vocab_size = tokenizer.vocab_size
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
|
||||
from sglang.backend.openai import OpenAI
|
||||
from sglang.backend.runtime_endpoint import RuntimeEndpoint
|
||||
from sglang.global_config import global_config
|
||||
|
||||
Reference in New Issue
Block a user