enginex-c_series-vl/server.py

import base64
import gc
import io
import os
import time
import uvicorn
from typing import List, Optional, Dict, Any, Tuple

import torch

from PIL import Image
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from transformers import (AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq, AutoModel)

import logger
# Module-level logger, created by the project-local `logger` helper with this
# file's path as its argument.
log = logger.get_logger(__file__)

# FastAPI application object; the startup hook and the /infer route in this
# file are registered on it through decorators.
app = FastAPI()

# Process-wide inference state. Every slot starts as None; load_model declares
# all of them `global` and fills them in when the application starts.
model_type = None  # `model_type` string read from the checkpoint's AutoConfig
model = None       # object returned by AutoModel.from_pretrained
device = None      # target-device slot, declared global in load_model
tokenizer = None   # object returned by AutoTokenizer.from_pretrained
# load_model assigns "success" here after loading; initialised up front so that
# reading it before startup completes yields None rather than a NameError.
status = None

class GenParams(BaseModel):
    """Generation settings carried inside an inference request."""
    # Presumably the cap on generated tokens (name-based; TODO confirm how it
    # is consumed).
    max_new_tokens: int = 128
    # Sampling temperature; handle_minicpmv passes it to model.chat as
    # `temperature=`.
    temperature: float = 0.0
    # Presumably the nucleus-sampling cutoff (name-based; TODO confirm how it
    # is consumed).
    top_p: float = 1.0
    # Sampling switch; handle_minicpmv passes it to model.chat as `sampling=`.
    do_sample: bool = False

class InferRequest(BaseModel):
    """Request body accepted by the POST /infer endpoint."""
    # Prompt text; infer hands it to the model handler as the user message.
    prompt: str
    # Generation settings; defaults to a GenParams instance built with its own
    # field defaults.
    generation: GenParams = GenParams()
    # Requested dtype name.
    # NOTE(review): not read by the code visible in this file — confirm usage.
    dtype: str = "auto"  # "auto"|"float16"|"bfloat16"|"float32"
    # NOTE(review): presumably the number of warm-up passes; not read by the
    # code visible in this file — confirm usage.
    warmup_runs: int = 1
    # NOTE(review): presumably toggles per-token timing; not read by the code
    # visible in this file — confirm usage.
    measure_token_times: bool = False

@app.on_event("startup")
def load_model():
    """Load the config, tokenizer and model from ``/model`` at app startup.

    Fills the module-level globals ``model_type``, ``tokenizer``, ``model``,
    ``device`` and ``status``. The model is loaded in float32, moved to the
    ``"cuda"`` device and put into eval mode. Exceptions raised by the
    transformers loaders are not caught here.
    """
    global status, device, model_type, model, tokenizer
    log.info("loading model")

    model_path = "/model"
    # Record the target device in the module global instead of leaving it as
    # None while the model is moved with a separate literal.
    device = "cuda"

    # trust_remote_code=True allows transformers to execute Python code shipped
    # inside the checkpoint directory; only point this at trusted checkpoints.
    cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    model_type = cfg.model_type
    log.info(f"model type: {model_type}")

    tokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, use_fast=True
    )

    # device_map=None: load without automatic placement, then move the whole
    # model to the target device explicitly.
    model = AutoModel.from_pretrained(
        model_path,
        torch_dtype=torch.float32,
        device_map=None,
        trust_remote_code=True,
    )
    model.to(device)
    model.eval()

    status = "success"
    log.info("model loaded successfully")

@app.post("/infer")
def infer(req: InferRequest):
    """Run one inference for ``req.prompt`` against the fixed image ``1.PNG``.

    Args:
        req: Request body; only ``prompt`` and ``generation`` are read here.

    Returns:
        A dict with the single key ``"output_text"`` holding the handler's
        output.

    Raises:
        HTTPException: status 501 when the loaded ``model_type`` has no
            handler in this file.
    """
    # Only "minicpmv" has a handler. Without this guard, `text` would be
    # unbound further down and the request would fail with an opaque error.
    if model_type != "minicpmv":
        raise HTTPException(
            status_code=501,
            detail=f"unsupported model type: {model_type}",
        )

    # The image path is hard-coded and relative to the working directory.
    # convert() returns a new image, so the source file can be closed as soon
    # as the conversion is done.
    with Image.open('1.PNG') as source_image:
        image = source_image.convert('RGB')

    text = handle_minicpmv(image, req.prompt, req.generation)
    log.info(f"text={text}")

    return {"output_text": text}

def handle_minicpmv(image: Image.Image, prompt: str, gen: GenParams):
    """Ask the loaded model a single-turn question about ``image``.

    Args:
        image: Image forwarded to ``model.chat``.
        prompt: User text, sent as the only message of the conversation.
        gen: Generation settings; ``do_sample`` and ``temperature`` are the
            fields forwarded to ``model.chat``.

    Returns:
        Whatever ``model.chat`` returns for this non-streaming call.
    """
    # Keyword arguments for the model's own chat() method, using the
    # module-level tokenizer and a one-message conversation.
    chat_kwargs = dict(
        image=image,
        msgs=[{"role": "user", "content": prompt}],
        tokenizer=tokenizer,
        sampling=gen.do_sample,
        temperature=gen.temperature,
        stream=False,
    )
    return model.chat(**chat_kwargs)

# Script entry point: serve the app with uvicorn. The "server:app" import
# string means this module must be importable under the name `server`.
if __name__ == "__main__":
    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "workers": 1,
        "access_log": False,
    }
    uvicorn.run("server:app", **server_options)
support metax c500 2025-09-19 14:46:59 +08:00			`import base64`
			`import gc`
			`import io`
			`import os`
			`import time`
			`import uvicorn`
			`from typing import List, Optional, Dict, Any, Tuple`

			`import torch`

			`from PIL import Image`
			`from fastapi import FastAPI, HTTPException, Query`
			`from pydantic import BaseModel`
			`from transformers import (AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq, AutoModel)`

			`import logger`
			`log = logger.get_logger(__file__)`

			`app = FastAPI()`

			`model_type = None`
			`model = None`
			`device = None`
			`tokenizer = None`

			`class GenParams(BaseModel):`
			`max_new_tokens: int = 128`
			`temperature: float = 0.0`
			`top_p: float = 1.0`
			`do_sample: bool = False`

			`class InferRequest(BaseModel):`
			`prompt: str`
			`generation: GenParams = GenParams()`
			`dtype: str = "auto" # "auto"\|"float16"\|"bfloat16"\|"float32"`
			`warmup_runs: int = 1`
			`measure_token_times: bool = False`

			`@app.on_event("startup")`
			`def load_model():`
			`log.info("loading model")`
			`global status, device, model_type, model, tokenizer`

			`model_path = "/model"`
			`cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)`
			`model_type = cfg.model_type`
			`log.info(f"model type: {model_type}")`

			`tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=True)`

			`model = AutoModel.from_pretrained(model_path, torch_dtype=torch.float32,`
			`device_map=None, trust_remote_code=True)`
			`model.to("cuda")`
			`model.eval()`

			`status = "success"`
			`log.info(f"model loaded successfully")`

			`@app.post("/infer")`
			`def infer(req: InferRequest):`
			`image = Image.open('1.PNG').convert('RGB')`

			`if model_type == "minicpmv":`
			`text = handle_minicpmv(image, req.prompt, req.generation)`
			`log.info(f"text={text}")`

			`result = dict()`
			`result["output_text"] = text`

			`return result`

			`def handle_minicpmv(image: Image.Image, prompt: str, gen: GenParams):`
			`# Prepare msgs in the format expected by model.chat`
			`msgs = [{"role": "user", "content": prompt}]`

			`# Call the model's built-in chat method`
			`response = model.chat(image=image, msgs=msgs, tokenizer=tokenizer,`
			`sampling=gen.do_sample, temperature=gen.temperature, stream=False)`

			`return response`

			`if __name__ == '__main__':`
			`uvicorn.run("server:app", host="0.0.0.0", port=8000, workers=1, access_log=False)`