[Refactor] simplify multimodal data processing (#8107)
Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
This commit is contained in:
@@ -126,14 +126,14 @@
|
||||
" images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
|
||||
")\n",
|
||||
"input_ids = processed_prompt[\"input_ids\"][0].detach().cpu().tolist()\n",
|
||||
-"precomputed_features = vision(\n",
|
||||
+"precomputed_embeddings = vision(\n",
|
||||
" processed_prompt[\"pixel_values\"].cuda(), processed_prompt[\"image_grid_thw\"].cuda()\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"mm_item = dict(\n",
|
||||
" modality=\"IMAGE\",\n",
|
||||
" image_grid_thw=processed_prompt[\"image_grid_thw\"],\n",
|
||||
-" precomputed_features=precomputed_features,\n",
|
||||
+" precomputed_embeddings=precomputed_embeddings,\n",
|
||||
")\n",
|
||||
"out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
|
||||
"print(out[\"text\"])"
|
||||
|
||||
Reference in New Issue
Block a user