from __future__ import annotations

import json
import os
from pathlib import Path
from threading import Lock

import gradio as gr
import torch
from huggingface_hub import snapshot_download
from transformers import AutoModelForMultimodalLM, AutoProcessor

# Heading shown at the top of both the live and the showcase UI.
# NOTE(review): the title and default repo id mention "Qwen3 Text GGUF" while the
# user-facing messages below talk about a "Gemma E4B" multimodal checkpoint —
# confirm which naming is intended.
MODEL_TITLE = "LumynaX Infused Qwen3 Text GGUF"
# Repository downloaded when MODEL_REPO_ENV_VAR is unset or blank.
DEFAULT_MODEL_REPO_ID = "AbteeXAILab/lumynax-infused-qwen3-text-gguf"
# Name of the environment variable that overrides the repository id.
MODEL_REPO_ENV_VAR = "LUMYNAX_MODEL_REPO_ID"
# Environment variables checked, in this order, for a Hugging Face access token.
HF_TOKEN_ENV_VARS = ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "HUGGINGFACE_HUB_TOKEN")
# Sample assets: pre-filled into the live URL boxes and shown in the showcase tabs.
DEFAULT_IMAGE_URL = "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/Demos/sample-data/GoldenGate.png"
DEFAULT_AUDIO_URL = "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/Demos/sample-data/journal1.wav"
# Returned to the user (and recorded as the load error) when CUDA is unavailable.
GPU_REQUIRED_MESSAGE = (
    "Live inference for this Space needs GPU-backed Hugging Face hardware. "
    "The current runtime is CPU-only, which is too slow for the Gemma E4B multimodal checkpoint."
)
# Banner text rendered at the top of the showcase (no-GPU) UI.
SHOWCASE_MESSAGE = (
    "This Space is running in showcase mode on CPU hardware. "
    "The examples below were captured during package validation so people can still see how the model behaves. "
    "If GPU hardware is attached later, this same Space will switch back to live inference automatically."
)
# Static examples rendered by the showcase tabs, keyed by modality.
# Each entry carries the prompt, the response text, and a "parsed_output" object
# that is pretty-printed as JSON next to the response. SHOWCASE_MESSAGE describes
# these as captured during package validation; nothing here is computed at runtime.
SHOWCASE_SAMPLES = {
    "text": {
        "prompt": "Who are you? Reply in one short sentence.",
        "response": "I am LumynaX, operating from the LumynaX Infused Gemma E4B Model package.",
        "parsed_output": {
            "role": "assistant",
            "content": "I am LumynaX, operating from the LumynaX Infused Gemma E4B Model package.",
        },
    },
    "image": {
        "prompt": "What is shown in this image? Reply in under 12 words.",
        "response": "The iconic Golden Gate Bridge spans the water under a clear sky. I am LumynaX.",
        "parsed_output": {
            "role": "assistant",
            "content": "The iconic Golden Gate Bridge spans the water under a clear sky. I am LumynaX.",
        },
    },
    # The audio and reasoning entries hold a validation summary rather than a
    # role/content message.
    "audio": {
        "prompt": "Transcribe the speech in one line only.",
        "response": 'A local validation run transcribed the bundled sample audio and included: "My name is LumynaX."',
        "parsed_output": {
            "validation_summary": 'A local validation run transcribed the bundled sample audio and included: "My name is LumynaX."',
        },
    },
    "reasoning": {
        "prompt": "Explain what this package is in one short sentence.",
        "response": "Reasoning mode was verified locally and returned a non-empty structured thinking field.",
        "parsed_output": {
            "validation_summary": "Reasoning mode was verified locally and returned a non-empty structured thinking field.",
        },
    },
}

# Process-wide cache filled by _load_runtime().
_MODEL = None  # loaded model, or None until the first successful load
_PROCESSOR = None  # matching processor, or None until the first successful load
_LOAD_ERROR = None  # "ExceptionType: message" of the first failed load, else None
_LOAD_LOCK = Lock()  # held while a load attempt is in progress


def _resolve_hf_token() -> str | None:
    """Return the first non-blank Hugging Face token found in the environment.

    The variables listed in ``HF_TOKEN_ENV_VARS`` are consulted in order and
    surrounding whitespace is stripped from each value.

    Returns:
        The stripped token, or ``None`` when every variable is unset or blank.
    """
    stripped_values = (os.environ.get(name, "").strip() for name in HF_TOKEN_ENV_VARS)
    return next((token for token in stripped_values if token), None)


def _has_supported_gpu_runtime() -> bool:
    """Report whether PyTorch can see a CUDA device in this process.

    Returns:
        ``True`` when ``torch.cuda.is_available()`` is truthy, else ``False``.
    """
    cuda_ready = torch.cuda.is_available()
    return True if cuda_ready else False


def _load_runtime() -> tuple[object, object]:
    """Return the cached ``(model, processor)`` pair, loading it on first use.

    The cached state is checked once without the lock and again after taking
    ``_LOAD_LOCK``. A load attempt requires CUDA, downloads only the
    ``merged_model/`` folder of the configured repository, and loads the
    processor and model from that folder.

    Returns:
        The module-level ``(_MODEL, _PROCESSOR)`` pair.

    Raises:
        RuntimeError: If a previous attempt failed (the recorded message is
            replayed), or if no CUDA device is available.
        FileNotFoundError: If the snapshot does not contain ``merged_model/``.
        Exception: Any other error from the download or the loaders is
            re-raised unchanged after being recorded in ``_LOAD_ERROR``.
    """
    global _MODEL, _PROCESSOR, _LOAD_ERROR

    def _cached_pair() -> tuple[object, object] | None:
        # A finished load takes priority over a recorded failure.
        if _MODEL is not None and _PROCESSOR is not None:
            return _MODEL, _PROCESSOR
        if _LOAD_ERROR is not None:
            raise RuntimeError(_LOAD_ERROR)
        return None

    ready = _cached_pair()
    if ready is not None:
        return ready

    with _LOAD_LOCK:
        # Re-check: the state may have changed while waiting for the lock.
        ready = _cached_pair()
        if ready is not None:
            return ready

        try:
            if not _has_supported_gpu_runtime():
                raise RuntimeError(GPU_REQUIRED_MESSAGE)

            configured_repo = os.environ.get(MODEL_REPO_ENV_VAR, "").strip()
            repo_id = configured_repo if configured_repo else DEFAULT_MODEL_REPO_ID
            downloaded_to = snapshot_download(
                repo_id=repo_id,
                token=_resolve_hf_token(),
                allow_patterns=["merged_model/*"],
            )
            snapshot_path = Path(downloaded_to)
            model_dir = snapshot_path / "merged_model"
            if not model_dir.exists():
                raise FileNotFoundError(f"Expected merged_model/ in {snapshot_path} after downloading {repo_id}.")

            weights_location = str(model_dir)
            loaded_processor = AutoProcessor.from_pretrained(weights_location)
            loaded_model = AutoModelForMultimodalLM.from_pretrained(
                weights_location,
                dtype="auto",
                device_map="auto",
                low_cpu_mem_usage=True,
            )
        except Exception as exc:
            # Remember the failure so later calls fail fast instead of retrying.
            _LOAD_ERROR = f"{type(exc).__name__}: {exc}"
            raise

        # Publish the processor first, then the model.
        _PROCESSOR, _MODEL = loaded_processor, loaded_model
        return _MODEL, _PROCESSOR


def _resolve_media_reference(upload_value: str | None, url_value: str | None) -> str | None:
    """Pick the media reference to send to the model for one modality.

    An uploaded file takes precedence over the URL field. The URL boxes in the
    live UI are pre-filled with sample assets, so preferring the URL would
    cause every upload to be ignored unless the user first cleared the box.

    Args:
        upload_value: Local file path from the upload widget, or ``None``.
        url_value: Text from the URL box, or ``None``.

    Returns:
        The stripped upload path when one is present, otherwise the stripped
        URL, otherwise ``None``. Non-string and blank values are ignored.
    """
    for candidate in (upload_value, url_value):
        if isinstance(candidate, str):
            cleaned = candidate.strip()
            if cleaned:
                return cleaned
    return None


def _extract_response_text(parsed: object) -> str:
    """Reduce a parsed model response to the text shown in the Response box.

    Args:
        parsed: A dict, a string, or any other JSON-serialisable value.

    Returns:
        The stripped ``"content"`` entry when ``parsed`` is a dict holding a
        non-blank string there; the stripped string when ``parsed`` is a
        string; otherwise ``parsed`` rendered as indented JSON (values JSON
        cannot encode are passed through ``str``).
    """
    if isinstance(parsed, str):
        return parsed.strip()

    if isinstance(parsed, dict):
        message_body = parsed.get("content")
        if isinstance(message_body, str):
            trimmed = message_body.strip()
            if trimmed:
                return trimmed

    return json.dumps(parsed, indent=2, ensure_ascii=False, default=str)


def _format_json(value: object) -> str:
    """Render ``value`` as two-space-indented JSON.

    Non-ASCII characters are kept as-is, and objects JSON cannot encode are
    converted with ``str``.
    """
    dump_options = {"indent": 2, "ensure_ascii": False, "default": str}
    return json.dumps(value, **dump_options)


def run_request(
    *,
    prompt: str,
    thinking: bool,
    max_new_tokens: int,
    image_upload: str | None = None,
    image_url: str = "",
    audio_upload: str | None = None,
    audio_url: str = "",
) -> tuple[str, str]:
    """Run one generation request and return the answer plus a JSON debug view.

    Args:
        prompt: User text; must contain non-whitespace characters.
        thinking: Forwarded to the chat template as ``enable_thinking``.
        max_new_tokens: Generation budget; coerced with ``int``.
        image_upload: Path of an uploaded image, if any.
        image_url: Image URL, if any.
        audio_upload: Path of an uploaded audio file, if any.
        audio_url: Audio URL, if any.

    Returns:
        ``(response_text, parsed_output_json)``. Without a CUDA device the
        GPU-required message is returned in both slots instead of running the
        model.

    Raises:
        gr.Error: If ``prompt`` is blank.
    """
    if not prompt.strip():
        raise gr.Error("A prompt is required.")

    if not _has_supported_gpu_runtime():
        return GPU_REQUIRED_MESSAGE, _format_json({"error": GPU_REQUIRED_MESSAGE})

    # (content type, key carrying the reference, resolved reference) per modality;
    # image comes before audio, and the text part always goes last.
    media_slots = (
        ("image", "url", _resolve_media_reference(image_upload, image_url)),
        ("audio", "audio", _resolve_media_reference(audio_upload, audio_url)),
    )
    user_content: list[dict[str, str]] = [
        {"type": kind, ref_key: reference} for kind, ref_key, reference in media_slots if reference
    ]
    user_content.append({"type": "text", "text": prompt.strip()})
    conversation = [{"role": "user", "content": user_content}]

    model, processor = _load_runtime()
    encoded = processor.apply_chat_template(
        conversation,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        add_generation_prompt=True,
        enable_thinking=thinking,
    )
    encoded = encoded.to(model.device)
    prompt_length = encoded["input_ids"].shape[-1]

    with torch.inference_mode():
        generated = model.generate(
            **encoded,
            max_new_tokens=int(max_new_tokens),
            do_sample=False,
        )

    # Skip the first ``prompt_length`` positions so only newly generated tokens
    # are decoded (assumes the output sequence starts with the prompt tokens).
    completion_ids = generated[0][prompt_length:]
    raw_text = processor.decode(completion_ids, skip_special_tokens=False)
    if hasattr(processor, "parse_response"):
        parsed = processor.parse_response(raw_text)
    else:
        parsed = raw_text
    return _extract_response_text(parsed), _format_json(parsed)


def run_text(prompt: str, thinking: bool, max_new_tokens: int) -> tuple[str, str]:
    """Click handler for the Text tab: run a request with no media attached.

    Returns:
        Whatever ``run_request`` returns: ``(response_text, parsed_output_json)``.
    """
    request = {
        "prompt": prompt,
        "thinking": thinking,
        "max_new_tokens": max_new_tokens,
    }
    return run_request(**request)


def run_image(
    prompt: str,
    image_upload: str | None,
    image_url: str,
    thinking: bool,
    max_new_tokens: int,
) -> tuple[str, str]:
    """Click handler for the Image tab: run a request with image inputs only.

    Returns:
        Whatever ``run_request`` returns: ``(response_text, parsed_output_json)``.
    """
    request = {
        "prompt": prompt,
        "thinking": thinking,
        "max_new_tokens": max_new_tokens,
        "image_upload": image_upload,
        "image_url": image_url,
    }
    return run_request(**request)


def run_audio(
    prompt: str,
    audio_upload: str | None,
    audio_url: str,
    thinking: bool,
    max_new_tokens: int,
) -> tuple[str, str]:
    """Click handler for the Audio tab: run a request with audio inputs only.

    Returns:
        Whatever ``run_request`` returns: ``(response_text, parsed_output_json)``.
    """
    request = {
        "prompt": prompt,
        "thinking": thinking,
        "max_new_tokens": max_new_tokens,
        "audio_upload": audio_upload,
        "audio_url": audio_url,
    }
    return run_request(**request)


def _render_showcase_sample(
    *,
    prompt: str,
    response: str,
    parsed_output: object,
    media_markdown: str | None = None,
    media_url: str | None = None,
) -> None:
    """Add the read-only widgets for one showcase example to the current layout.

    Args:
        prompt: Example prompt text.
        response: Example response text.
        parsed_output: Value pretty-printed as JSON in the code viewer.
        media_markdown: Optional Markdown rendered above the text fields.
        media_url: Optional asset URL shown in its own single-line field.
    """
    if media_markdown:
        gr.Markdown(media_markdown)

    # (label, value, visible line count) for each read-only text field, top to bottom.
    text_fields = []
    if media_url:
        text_fields.append(("Sample Asset URL", media_url, 1))
    text_fields.append(("Example Prompt", prompt, 3))
    text_fields.append(("Example Response", response, 6))

    for field_label, field_value, line_count in text_fields:
        gr.Textbox(label=field_label, value=field_value, interactive=False, lines=line_count)

    parsed_json = _format_json(parsed_output)
    gr.Code(label="Example Parsed Output", value=parsed_json, language="json")


def _build_live_ui() -> None:
    """Lay out the interactive Text, Image and Audio tabs in the current Blocks context.

    Every tab has a prompt box, a reasoning toggle, a token-budget slider, a run
    button and two output widgets; the Image and Audio tabs add an upload widget
    and a URL box pre-filled with the sample asset.
    """

    def _generation_controls(button_label: str):
        # Widgets shared by every tab, created in display order.
        with gr.Row():
            reasoning_toggle = gr.Checkbox(label="Enable Reasoning", value=False)
            token_budget = gr.Slider(label="Max New Tokens", minimum=16, maximum=256, value=64, step=16)
        run_button = gr.Button(button_label, variant="primary")
        answer_box = gr.Textbox(label="Response", lines=8)
        parsed_view = gr.Code(label="Parsed Output", language="json")
        return reasoning_toggle, token_budget, run_button, answer_box, parsed_view

    intro_markdown = (
        f"# {MODEL_TITLE}\n\n"
        "Live multimodal demo mode is active because GPU hardware is available. "
        "The LumynaX identity comes from the packaged model template and is not user-editable here."
    )
    gr.Markdown(intro_markdown)

    with gr.Tab("Text"):
        prompt_box = gr.Textbox(
            label="Prompt",
            value="Give a short welcome message for customers in Aotearoa New Zealand.",
            lines=4,
        )
        toggle, budget, button, answer, parsed = _generation_controls("Run Text Demo")
        button.click(
            run_text,
            inputs=[prompt_box, toggle, budget],
            outputs=[answer, parsed],
        )

    with gr.Tab("Image"):
        prompt_box = gr.Textbox(
            label="Prompt",
            value="What is shown in this image? Reply in under 12 words.",
            lines=3,
        )
        upload_widget = gr.Image(label="Upload Image", type="filepath")
        url_box = gr.Textbox(label="Or Image URL", value=DEFAULT_IMAGE_URL)
        toggle, budget, button, answer, parsed = _generation_controls("Run Image Demo")
        button.click(
            run_image,
            inputs=[prompt_box, upload_widget, url_box, toggle, budget],
            outputs=[answer, parsed],
        )

    with gr.Tab("Audio"):
        prompt_box = gr.Textbox(
            label="Prompt",
            value="Transcribe the speech in one line only.",
            lines=3,
        )
        upload_widget = gr.Audio(label="Upload Audio", type="filepath")
        url_box = gr.Textbox(label="Or Audio URL", value=DEFAULT_AUDIO_URL)
        toggle, budget, button, answer, parsed = _generation_controls("Run Audio Demo")
        button.click(
            run_audio,
            inputs=[prompt_box, upload_widget, url_box, toggle, budget],
            outputs=[answer, parsed],
        )


def _build_showcase_ui() -> None:
    """Lay out the static showcase tabs in the current Blocks context.

    Renders a banner, an overview tab, one tab per entry of
    ``SHOWCASE_SAMPLES`` and a final tab with quickstart commands. No model
    code is invoked from this UI.
    """
    banner_markdown = (
        f"# {MODEL_TITLE}\n\n"
        f"{SHOWCASE_MESSAGE}\n\n"
        "This is still the real package identity and real package structure, but not live inference on this CPU-only Space."
    )
    overview_markdown = (
        "### What this Space is showing\n"
        "- verified text, image, audio, and reasoning examples from package validation\n"
        "- the real packaged Gemma E4B release structure and LumynaX identity behavior\n"
        "- honest provenance: packaged upstream Gemma weights under a LumynaX runtime identity\n\n"
        "### Why this is showcase mode\n"
        "- Hugging Face `cpu-basic` cannot serve this checkpoint interactively\n"
        "- the same Space will switch to live inference automatically if GPU hardware is added later"
    )
    run_it_markdown = (
        "### Local or GPU-backed run\n"
        "Use the packaged files directly for a real interactive run, or attach GPU hardware to this Space."
    )
    quickstart_commands = (
        "pip install -r requirements.txt\n"
        "python quickstart.py\n"
        "python quickstart.py --mode image --image path-or-url\n"
        "python quickstart.py --mode audio --audio path-or-url"
    )
    # (tab label, SHOWCASE_SAMPLES key, extra media keyword arguments), in display order.
    sample_tabs = (
        ("Text Sample", "text", {}),
        (
            "Image Sample",
            "image",
            {
                "media_markdown": f"![Bundled sample image]({DEFAULT_IMAGE_URL})",
                "media_url": DEFAULT_IMAGE_URL,
            },
        ),
        ("Audio Sample", "audio", {"media_url": DEFAULT_AUDIO_URL}),
        ("Reasoning Note", "reasoning", {}),
    )

    gr.Markdown(banner_markdown)

    with gr.Tab("Overview"):
        gr.Markdown(overview_markdown)

    for tab_label, sample_key, media_kwargs in sample_tabs:
        with gr.Tab(tab_label):
            sample = SHOWCASE_SAMPLES[sample_key]
            _render_showcase_sample(
                prompt=sample["prompt"],
                response=sample["response"],
                parsed_output=sample["parsed_output"],
                **media_kwargs,
            )

    with gr.Tab("Run It"):
        gr.Markdown(run_it_markdown)
        gr.Textbox(
            label="Quickstart",
            interactive=False,
            lines=4,
            value=quickstart_commands,
        )


# Build the app at import time; ``demo`` is bound at module level.
with gr.Blocks() as demo:
    # Interactive widgets when CUDA is available, static examples otherwise.
    (_build_live_ui if _has_supported_gpu_runtime() else _build_showcase_ui)()


# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    demo.queue().launch(show_error=True)