"""Multimodal (image + audio) inference demo for Qwen3-Omni-30B-A3B-Instruct.

Loads the model and processor, builds a chat prompt containing an image and
an audio clip, generates a short text answer (and optionally speech), prints
the decoded text, and writes any generated audio to ``output.wav``.
"""

import os

# Must be set before torch is imported so the MLU caching allocator
# picks up the expandable-segments configuration.
os.environ["PYTORCH_MLU_ALLOC_CONF"] = "expandable_segments:True"

import soundfile as sf
import torch
from PIL import Image
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor

from qwen_omni_utils import process_mm_info

MODEL_PATH = "/mnt/models/Qwen3-Omni-30B-A3B-Instruct"

model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    dtype=torch.float16,
    device_map="auto",
    # attn_implementation="flash_attention_2",
)
processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "./cars.jpg"},
            {"type": "audio", "audio": "./cough.wav"},
            {"type": "text", "text": "What can you see and hear? Answer in one short sentence."},
        ],
    },
]

# Set whether to use audio in video
USE_AUDIO_IN_VIDEO = False

# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Replace the image loaded by process_mm_info with a downscaled copy to
# bound the number of vision tokens. The context manager closes the file
# handle that Image.open keeps open (resize() returns a new, detached image).
with Image.open("./cars.jpg") as img:
    images[0] = img.resize((448, 448))

inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
# NOTE(review): BatchFeature.to is expected to cast only floating-point
# tensors, leaving integer id tensors intact — confirm for this
# transformers version.
inputs = inputs.to(model.device, dtype=torch.float16)

# Inference: generation of the output text and audio
text_ids, audio = model.generate(
    **inputs,
    speaker="Ethan",
    thinker_return_dict_in_generate=True,
    max_new_tokens=10,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

# Strip the prompt tokens; decode only the newly generated continuation.
text = processor.batch_decode(
    text_ids.sequences[:, inputs["input_ids"].shape[1] :],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(text)

if audio is not None:
    sf.write(
        "output.wav",
        audio.reshape(-1).detach().cpu().numpy(),
        samplerate=24000,
    )