release initial code
Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: parasol-aser <3848358+parasol-aser@users.noreply.github.com>
Co-authored-by: LiviaSun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
test/srt/model/test_llava_low_api.py (new file, 161 lines)
@@ -0,0 +1,161 @@
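"""Test LLaVA-v1.5 inference through the low-level ModelRunner API.

This test drives the multi-modal prefill (extend) and per-token decode steps
directly, without going through the server or scheduler. It assumes a CUDA
device and a local test image, and fetches the model weights and the
llava-1.5-7b-hf processor from Hugging Face on first use.
"""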
import multiprocessing
import time

import numpy as np
import torch
import torch.distributed as dist

from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.managers.router.infer_batch import ForwardMode
from sglang.srt.managers.router.model_runner import InputMetadata, ModelRunner
from sglang.srt.model_config import ModelConfig
from sglang.srt.utils import load_image

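# Allocate request slots and KV-cache space for a batch of equal-length
# prompts, and record each prompt's cache locations in the request-to-token
# mapping. Returns the metadata tuple consumed by the forward calls below.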
def init_batch_data(model, batch_size, input_len):
    req_pool_indices = model.req_to_token_pool.alloc(batch_size)
    seq_lens = torch.full((batch_size,), input_len, dtype=torch.int32, device="cuda")
    prefix_lens = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
    position_ids_offsets = torch.zeros(batch_size, dtype=torch.int32, device="cuda")

    out_cache_loc = model.token_to_kv_pool.alloc(batch_size * input_len)
    for i in range(batch_size):
        model.req_to_token_pool.req_to_token[i, :input_len] = out_cache_loc[
            i * input_len : (i + 1) * input_len
        ]

    return (
        req_pool_indices,
        seq_lens,
        prefix_lens,
        position_ids_offsets,
        out_cache_loc,
    )

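# Run one multi-modal prefill (extend) pass over the full prompt and pick
# the next token greedily by taking the argmax over the output logits.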
def prefill(model, tp_rank, params, print_logits):
    logits, _ = model.forward_extend_multi_modal(
        *params,
        False,
    )
    prob_out = torch.softmax(logits, dim=-1)
    predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
    predict_ids = predict_ids.detach().cpu().numpy()

    if print_logits and tp_rank == 0:
        print("prefill logits", logits, logits.shape)

    return predict_ids

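# Run one autoregressive decode step: allocate a contiguous KV-cache slot
# for the new token, record it in the request-to-token mapping, advance the
# sequence lengths in place, and feed back the previously predicted ids.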
def decode(step, model, tp_rank, batch_size, predict_ids, params, print_logits):
    (
        req_pool_indices,
        seq_lens,
        prefix_lens,
        position_ids_offsets,
        out_cache_loc,
    ) = params

    (
        out_cache_loc,
        out_cache_cont_start,
        out_cache_cont_end,
    ) = model.token_to_kv_pool.alloc_contiguous(batch_size)
    model.req_to_token_pool.req_to_token[req_pool_indices, seq_lens] = out_cache_loc
    seq_lens.add_(1)
    logits = model.forward_decode(
        torch.from_numpy(predict_ids).cuda().reshape(-1),
        req_pool_indices,
        seq_lens,
        None,
        position_ids_offsets,
        None,
        out_cache_cont_start,
        out_cache_cont_end,
    )
    prob_out = torch.softmax(logits, dim=-1)
    predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
    predict_ids = predict_ids.detach().cpu().numpy()
    if print_logits and tp_rank == 0:
        print("decode", step, logits)
    return predict_ids

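# End-to-end worker for one tensor-parallel rank: build the model runner,
# tokenize the prompt and preprocess the image, run a single prefill, decode
# 16 more tokens greedily, and compare the detokenized text against a fixed
# expected string.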
def test_generate_worker(
    model_path,
    tp_rank,
    tp_size,
):
    model_config = ModelConfig(path=model_path)
    model = ModelRunner(model_config, 0.8, tp_rank, tp_size, 28888)
    # print(model.model)

    # Prepare data
    prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nDescribe this picture ASSISTANT:"
    image_path = "/home/ubuntu/sglang/test/lang/image.png"
    image = load_image(image_path)

    processor = get_processor("llava-hf/llava-1.5-7b-hf")
    input_ids = processor.tokenizer.encode(prompt)
    pixel_values = processor.image_processor(image)["pixel_values"]
    input_ids, offset = model.model.pad_input_ids(
        input_ids,
        [
            0,
        ],
    )

    params = init_batch_data(model, 1, len(input_ids))

    # inference
    output_ids = []
    prefill_params = (
        torch.tensor(np.array(input_ids)).cuda(),
        np.array(pixel_values),
        [offset],
        *params,
    )
    predict_ids = prefill(model, tp_rank=0, params=prefill_params, print_logits=False)
    output_ids.append(predict_ids[0][0])
    for i in range(16):
        predict_ids = decode(
            i,
            model,
            tp_rank=0,
            batch_size=1,
            predict_ids=predict_ids,
            params=params,
            print_logits=False,
        )
        output_ids.append(predict_ids[0][0])

    # detokenization
    output = processor.tokenizer.batch_decode(
        [output_ids], skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    assert (
        output
        == "The image features a man standing on the back of a yellow taxi cab, holding"
    )

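# Spawn one worker process per tensor-parallel rank and wait for all of them
# to finish.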
def test_generate(model_path, tp_size):
    workers = []
    for tp_rank in range(tp_size):
        proc = multiprocessing.Process(
            target=test_generate_worker,
            args=(
                model_path,
                tp_rank,
                tp_size,
            ),
        )
        proc.start()
        workers.append(proc)

    for proc in workers:
        proc.join()

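# Entry point: run llava-v1.5-7b on a single GPU (tp_size=1). Note the
# hard-coded image path in test_generate_worker; point it at a local image
# before running this elsewhere.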
if __name__ == "__main__":
    test_generate("liuhaotian/llava-v1.5-7b", 1)