"""Multimodal (image + audio) inference demo for Qwen3-Omni-30B-A3B-Instruct.

Loads the model and processor, builds a chat prompt containing an image and
an audio clip, generates a short text answer (and optionally speech), prints
the decoded text, and writes any generated audio to ``output.wav``.
"""

import os

# Must be set before torch is imported so the MLU caching allocator
# picks up the expandable-segments configuration.
os.environ["PYTORCH_MLU_ALLOC_CONF"] = "expandable_segments:True"

import soundfile as sf
import torch
from PIL import Image
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor

from qwen_omni_utils import process_mm_info

MODEL_PATH = "/mnt/models/Qwen3-Omni-30B-A3B-Instruct"

model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    dtype=torch.float16,
    device_map="auto",
    # attn_implementation="flash_attention_2",
)
processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "./cars.jpg"},
            {"type": "audio", "audio": "./cough.wav"},
            {"type": "text", "text": "What can you see and hear? Answer in one short sentence."},
        ],
    },
]

# Set whether to use audio in video
USE_AUDIO_IN_VIDEO = False

# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Replace the image loaded by process_mm_info with a downscaled copy to
# bound the number of vision tokens. The context manager closes the file
# handle that Image.open keeps open (resize() returns a new, detached image).
with Image.open("./cars.jpg") as img:
    images[0] = img.resize((448, 448))

inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
# NOTE(review): BatchFeature.to is expected to cast only floating-point
# tensors, leaving integer id tensors intact — confirm for this
# transformers version.
inputs = inputs.to(model.device, dtype=torch.float16)

# Inference: generation of the output text and audio
text_ids, audio = model.generate(
    **inputs,
    speaker="Ethan",
    thinker_return_dict_in_generate=True,
    max_new_tokens=10,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

# Strip the prompt tokens; decode only the newly generated continuation.
text = processor.batch_decode(
    text_ids.sequences[:, inputs["input_ids"].shape[1] :],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(text)

if audio is not None:
    sf.write(
        "output.wav",
        audio.reshape(-1).detach().cpu().numpy(),
        samplerate=24000,
    )