init

2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions
--- a/test.py
+++ b/test.py
@@ -0,0 +1,77 @@
+"""Run one image + audio + text prompt through Qwen3-Omni; print the reply and save any audio."""
+import os
+
+# Set before torch is imported below; presumably the MLU allocator reads it at start-up - TODO confirm.
+os.environ["PYTORCH_MLU_ALLOC_CONF"] = "expandable_segments:True"
+
+import soundfile as sf
+import torch
+from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
+from qwen_omni_utils import process_mm_info
+from PIL import Image
+
+# Checkpoint location handed to both from_pretrained() calls below.
+MODEL_PATH = "/mnt/models/Qwen3-Omni-30B-A3B-Instruct"
+
+model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
+    MODEL_PATH,
+    trust_remote_code=True,  # NOTE(review): permits running code shipped with the checkpoint - confirm it is needed
+    dtype=torch.float16,
+    device_map="auto",
+    # attn_implementation="flash_attention_2",  # optional setting, currently disabled
+)
+
+processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)
+
+# Single user turn with three content parts: an image, an audio clip and a text question.
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": "./cars.jpg"},
+            {"type": "audio", "audio": "./cough.wav"},
+            {"type": "text", "text": "What can you see and hear? Answer in one short sentence."}
+        ],
+    },
+]
+
+# Whether to use audio in video; forwarded to process_mm_info, the processor call and generate().
+USE_AUDIO_IN_VIDEO = False
+
+# Prompt string (tokenize=False) plus the audios/images/videos process_mm_info returns for the conversation.
+text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+
+image = Image.open("./cars.jpg")  # same path as the image entry in the conversation
+image = image.resize((448, 448))
+images[0] = image  # overrides the first image returned by process_mm_info with the 448x448 copy
+
+inputs = processor(text=text, 
+                   audio=audios, 
+                   images=images, 
+                   videos=videos, 
+                   return_tensors="pt", 
+                   padding=True, 
+                   use_audio_in_video=USE_AUDIO_IN_VIDEO)
+
+# Send inputs to model.device as float16, the dtype given to from_pretrained; .to(model.dtype) form is disabled.
+inputs = inputs.to(model.device, dtype=torch.float16)
+
+# Inference: generate() returns a pair - text output (its .sequences is decoded below) and audio.
+text_ids, audio = model.generate(**inputs, 
+                                 speaker="Ethan", 
+                                 thinker_return_dict_in_generate=True,
+                                 max_new_tokens=10,  # NOTE(review): confirm this cap reaches the text decoder; generate() may expect thinker_max_new_tokens
+                                 use_audio_in_video=USE_AUDIO_IN_VIDEO)
+
+text = processor.batch_decode(text_ids.sequences[:, inputs["input_ids"].shape[1] :],  # skip the prompt-length prefix
+                              skip_special_tokens=True,
+                              clean_up_tokenization_spaces=False)
+print(text)
+
+if audio is not None:
+    sf.write(
+        "output.wav",
+        audio.reshape(-1).detach().cpu().numpy(),  # flattened to 1-D before writing
+        samplerate=24000,
+    )