Update README.md
This commit is contained in:
78
README.md
78
README.md
@@ -9,4 +9,80 @@ This repository provides access to the **FAST-3B** model, which is built on the
|
|||||||
```
|
```
|
||||||
## Decoding Parameter
|
## Decoding Parameter
|
||||||
|
|
||||||
We recommend setting `temperature=0` to reproduce the reported performance. Note that performance may vary depending on the version of vLLM being used.
|
We recommend setting `temperature=0` to reproduce the reported performance. Note that performance may vary depending on the version of vLLM being used.
|
||||||
|
|
||||||
|
## Inference Guide
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
Install the required dependencies:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install vllm==0.8.1
|
||||||
|
```
|
||||||
|
|
||||||
|
### Starting the Server
|
||||||
|
|
||||||
|
Start the vLLM server with the following command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES=0 vllm serve /PATH/TO/FAST \
|
||||||
|
--max-model-len 12800 \
|
||||||
|
--dtype auto \
|
||||||
|
--gpu_memory_utilization 0.75 \
|
||||||
|
--trust-remote-code \
|
||||||
|
--max-num-seqs 12 \
|
||||||
|
--mm-processor-kwargs '{"max_pixels":1002112}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace `/PATH/TO/FAST` with the actual path to your model.
|
||||||
|
|
||||||
|
### Simple Demo
|
||||||
|
|
||||||
|
```python
|
||||||
|
import base64
import mimetypes


# System prompt instructing the model to reason inside <think> tags and put
# the final answer in \boxed{}.
SYSTEM_PROMPT = """You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \\boxed{}."""


def simple_inference(image_path, query, max_tokens=2048, temperature=0):
    """
    Perform a simple inference with an image and a text query.

    Args:
        image_path (str): Path to the input image file.
        query (str): Text query for the model.
        max_tokens (int): Maximum number of tokens in the response.
        temperature (float): Sampling temperature for the model.

    Returns:
        str: The model's response, or None if the request fails.
    """
    # Detect the MIME type from the file extension; fall back to JPEG so that
    # .jpg inputs (and unknown extensions) behave exactly as before, while
    # PNG/WebP images are no longer mislabeled as image/jpeg.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Load the image and embed it as a base64 data URL.
    with open(image_path, 'rb') as file:
        image_base64 = f"data:{mime_type};base64," + base64.b64encode(file.read()).decode('utf-8')

    # Prepare the chat request
    request = {
        "model": "/PATH/TO/FAST",  # Replace with your model path
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},  # Add the system prompt
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": query},
                    {"type": "image_url", "image_url": {"url": image_base64}},
                ],
            },
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    # Call the chat API exposed by the local vLLM OpenAI-compatible server.
    try:
        from openai import OpenAI

        client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
        chat_response = client.chat.completions.create(**request)
        return chat_response.choices[0].message.content
    except Exception as e:
        # Best-effort demo: report the failure and return None rather than crash.
        print(f"Error during inference: {e}")
        return None
|
||||||
|
```
|
||||||
Reference in New Issue
Block a user