396 lines
14 KiB
Python
396 lines
14 KiB
Python
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
from pathlib import Path
|
||
|
|
from threading import Lock
|
||
|
|
|
||
|
|
import gradio as gr
|
||
|
|
import torch
|
||
|
|
from huggingface_hub import snapshot_download
|
||
|
|
from transformers import AutoModelForMultimodalLM, AutoProcessor
|
||
|
|
|
||
|
|
# Heading shown at the top of both the live and the showcase UI.
# NOTE(review): the title and default repo id say "Qwen3 Text GGUF", while the
# user-facing messages below refer to a "Gemma E4B" multimodal checkpoint —
# confirm which naming is intended.
MODEL_TITLE = "LumynaX Infused Qwen3 Text GGUF"
# Hub repository used when the override environment variable is unset or blank.
DEFAULT_MODEL_REPO_ID = "AbteeXAILab/lumynax-infused-qwen3-text-gguf"
# Name of the environment variable that overrides DEFAULT_MODEL_REPO_ID.
MODEL_REPO_ENV_VAR = "LUMYNAX_MODEL_REPO_ID"
# Environment variables checked, in this order, for a Hugging Face access token.
HF_TOKEN_ENV_VARS = ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "HUGGINGFACE_HUB_TOKEN")
# Sample assets: pre-filled into the live UI URL fields and listed in the showcase tabs.
DEFAULT_IMAGE_URL = "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/Demos/sample-data/GoldenGate.png"
DEFAULT_AUDIO_URL = "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/Demos/sample-data/journal1.wav"
# Error text used when inference is requested but no CUDA device is available.
GPU_REQUIRED_MESSAGE = (
    "Live inference for this Space needs GPU-backed Hugging Face hardware. "
    "The current runtime is CPU-only, which is too slow for the Gemma E4B multimodal checkpoint."
)
# Banner text displayed at the top of the CPU-only showcase UI.
SHOWCASE_MESSAGE = (
    "This Space is running in showcase mode on CPU hardware. "
    "The examples below were captured during package validation so people can still see how the model behaves. "
    "If GPU hardware is attached later, this same Space will switch back to live inference automatically."
)
|
||
|
|
# Static examples rendered by the showcase UI, keyed by sample kind.
# Every entry carries the "prompt", the "response" text, and the "parsed_output"
# object that is pretty-printed as JSON next to it. Nothing here is computed at
# runtime; the values are fixed strings.
SHOWCASE_SAMPLES = {
    "text": {
        "prompt": "Who are you? Reply in one short sentence.",
        "response": "I am LumynaX, operating from the LumynaX Infused Gemma E4B Model package.",
        "parsed_output": {
            "role": "assistant",
            "content": "I am LumynaX, operating from the LumynaX Infused Gemma E4B Model package.",
        },
    },
    "image": {
        "prompt": "What is shown in this image? Reply in under 12 words.",
        "response": "The iconic Golden Gate Bridge spans the water under a clear sky. I am LumynaX.",
        "parsed_output": {
            "role": "assistant",
            "content": "The iconic Golden Gate Bridge spans the water under a clear sky. I am LumynaX.",
        },
    },
    # The audio and reasoning entries hold a validation summary rather than a
    # role/content message.
    "audio": {
        "prompt": "Transcribe the speech in one line only.",
        "response": 'A local validation run transcribed the bundled sample audio and included: "My name is LumynaX."',
        "parsed_output": {
            "validation_summary": 'A local validation run transcribed the bundled sample audio and included: "My name is LumynaX."',
        },
    },
    "reasoning": {
        "prompt": "Explain what this package is in one short sentence.",
        "response": "Reasoning mode was verified locally and returned a non-empty structured thinking field.",
        "parsed_output": {
            "validation_summary": "Reasoning mode was verified locally and returned a non-empty structured thinking field.",
        },
    },
}
|
||
|
|
|
||
|
|
# Process-wide cache filled in by _load_runtime(); both remain None until a
# load has succeeded.
_MODEL = None
_PROCESSOR = None
# "<ExceptionType>: <message>" of the first failed load. Once set,
# _load_runtime() re-raises it instead of attempting another load.
_LOAD_ERROR = None
# Held for the duration of the load so only one thread performs it.
_LOAD_LOCK = Lock()
|
||
|
|
|
||
|
|
|
||
|
|
def _resolve_hf_token() -> str | None:
    """Return the first non-blank Hugging Face token found in the environment.

    The variables named in ``HF_TOKEN_ENV_VARS`` are consulted in order and
    surrounding whitespace is stripped from each value.

    Returns:
        The stripped token, or ``None`` when every variable is unset or blank.
    """
    stripped_values = (os.environ.get(name, "").strip() for name in HF_TOKEN_ENV_VARS)
    return next((token for token in stripped_values if token), None)
|
||
|
|
|
||
|
|
|
||
|
|
def _has_supported_gpu_runtime() -> bool:
    """Report whether PyTorch can see a CUDA device in this process."""
    cuda_available = torch.cuda.is_available()
    return bool(cuda_available)
|
||
|
|
|
||
|
|
|
||
|
|
def _load_runtime() -> tuple[object, object]:
    """Return the cached ``(model, processor)`` pair, loading it on first use.

    The first successful call downloads the ``merged_model/`` folder of the
    configured Hub repository and loads the processor and model from it; the
    result is stored in module globals and reused by later calls.

    Returns:
        The ``(model, processor)`` tuple.

    Raises:
        RuntimeError: If no CUDA device is available, or if an earlier call
            already failed (the stored failure message is re-raised and no
            new load is attempted).
        FileNotFoundError: If the downloaded snapshot has no ``merged_model``
            directory.
        Exception: Any other error raised while downloading or loading is
            recorded in ``_LOAD_ERROR`` and re-raised unchanged.
    """
    global _MODEL, _PROCESSOR, _LOAD_ERROR

    # Fast path: answer from the cache without taking the lock.
    if _MODEL is not None and _PROCESSOR is not None:
        return _MODEL, _PROCESSOR
    if _LOAD_ERROR is not None:
        raise RuntimeError(_LOAD_ERROR)

    with _LOAD_LOCK:
        # Re-check under the lock: another thread may have finished (or
        # failed) the load while this one was waiting.
        if _MODEL is not None and _PROCESSOR is not None:
            return _MODEL, _PROCESSOR
        if _LOAD_ERROR is not None:
            raise RuntimeError(_LOAD_ERROR)

        try:
            if not _has_supported_gpu_runtime():
                raise RuntimeError(GPU_REQUIRED_MESSAGE)
            # A blank or unset override falls back to the default repository.
            repo_id = os.environ.get(MODEL_REPO_ENV_VAR, "").strip() or DEFAULT_MODEL_REPO_ID
            snapshot_path = Path(
                snapshot_download(
                    repo_id=repo_id,
                    token=_resolve_hf_token(),
                    # Only the merged_model/ folder is requested from the repo.
                    allow_patterns=["merged_model/*"],
                )
            )
            model_dir = snapshot_path / "merged_model"
            if not model_dir.exists():
                raise FileNotFoundError(f"Expected merged_model/ in {snapshot_path} after downloading {repo_id}.")

            processor = AutoProcessor.from_pretrained(str(model_dir))
            model = AutoModelForMultimodalLM.from_pretrained(
                str(model_dir),
                dtype="auto",
                device_map="auto",
                low_cpu_mem_usage=True,
            )
            # Publish the globals only after both objects loaded successfully.
            _PROCESSOR = processor
            _MODEL = model
            return _MODEL, _PROCESSOR
        except Exception as exc:
            # Remember the failure so later calls fail fast with the same
            # message instead of repeating the download/load.
            _LOAD_ERROR = f"{type(exc).__name__}: {exc}"
            raise
|
||
|
|
|
||
|
|
|
||
|
|
def _resolve_media_reference(upload_value: str | None, url_value: str | None) -> str | None:
|
||
|
|
if isinstance(url_value, str) and url_value.strip():
|
||
|
|
return url_value.strip()
|
||
|
|
if isinstance(upload_value, str) and upload_value.strip():
|
||
|
|
return upload_value.strip()
|
||
|
|
return None
|
||
|
|
|
||
|
|
|
||
|
|
def _extract_response_text(parsed: object) -> str:
    """Reduce a parsed model response to the text shown in the response box.

    Args:
        parsed: Either a plain string, a dict that may carry a ``"content"``
            string, or any other JSON-like object.

    Returns:
        The stripped string for string input; the stripped ``"content"`` value
        for a dict whose content is a non-blank string; otherwise the whole
        object rendered as indented JSON (non-serializable values are
        converted with ``str``).
    """
    if isinstance(parsed, str):
        return parsed.strip()

    if isinstance(parsed, dict):
        text = parsed.get("content")
        if isinstance(text, str):
            trimmed = text.strip()
            if trimmed:
                return trimmed

    # Anything without usable text is shown as pretty-printed JSON.
    return json.dumps(parsed, indent=2, ensure_ascii=False, default=str)
|
||
|
|
|
||
|
|
|
||
|
|
def _format_json(value: object) -> str:
    """Render *value* as two-space-indented JSON for display.

    Non-ASCII characters are kept as-is, and objects the JSON encoder cannot
    serialize are converted with ``str``.
    """
    dump_options = {"indent": 2, "ensure_ascii": False, "default": str}
    return json.dumps(value, **dump_options)
|
||
|
|
|
||
|
|
|
||
|
|
def run_request(
    *,
    prompt: str,
    thinking: bool,
    max_new_tokens: int,
    image_upload: str | None = None,
    image_url: str = "",
    audio_upload: str | None = None,
    audio_url: str = "",
) -> tuple[str, str]:
    """Run one generation request and return ``(response_text, parsed_json)``.

    Args:
        prompt: User prompt; must contain non-whitespace text.
        thinking: Forwarded to the chat template as ``enable_thinking``.
        max_new_tokens: Generation budget; coerced to ``int``.
        image_upload: Path of an uploaded image, if any.
        image_url: Image URL field text.
        audio_upload: Path of an uploaded audio file, if any.
        audio_url: Audio URL field text.

    Returns:
        The display text and the parsed output rendered as JSON. When no CUDA
        device is available, both elements carry the GPU-required message
        instead of a model response.

    Raises:
        gr.Error: If the prompt is blank.
    """
    prompt_text = prompt.strip()
    if not prompt_text:
        raise gr.Error("A prompt is required.")

    if not _has_supported_gpu_runtime():
        error_payload = {"error": GPU_REQUIRED_MESSAGE}
        return GPU_REQUIRED_MESSAGE, _format_json(error_payload)

    # (content type, payload key, resolved reference) for each optional medium;
    # media parts are placed ahead of the text part.
    media_candidates = (
        ("image", "url", _resolve_media_reference(image_upload, image_url)),
        ("audio", "audio", _resolve_media_reference(audio_upload, audio_url)),
    )
    content: list[dict[str, str]] = [
        {"type": media_type, payload_key: reference}
        for media_type, payload_key, reference in media_candidates
        if reference
    ]
    content.append({"type": "text", "text": prompt_text})

    messages = [{"role": "user", "content": content}]

    model, processor = _load_runtime()
    template_options = {
        "tokenize": True,
        "return_dict": True,
        "return_tensors": "pt",
        "add_generation_prompt": True,
        "enable_thinking": thinking,
    }
    inputs = processor.apply_chat_template(messages, **template_options).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        generated = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            do_sample=False,
        )

    # Decode only the tokens produced after the prompt. Special tokens are
    # kept — presumably so parse_response can see them; confirm.
    completion_tokens = generated[0][prompt_length:]
    raw_response = processor.decode(completion_tokens, skip_special_tokens=False)
    if hasattr(processor, "parse_response"):
        parsed = processor.parse_response(raw_response)
    else:
        parsed = raw_response
    return _extract_response_text(parsed), _format_json(parsed)
|
||
|
|
|
||
|
|
|
||
|
|
def run_text(prompt: str, thinking: bool, max_new_tokens: int) -> tuple[str, str]:
    """Handle the Text tab: forward the inputs to ``run_request`` with no media.

    Takes positional arguments in the order of the click handler's ``inputs``
    list and returns the ``(response_text, parsed_json)`` pair.
    """
    return run_request(
        prompt=prompt,
        thinking=thinking,
        max_new_tokens=max_new_tokens,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def run_image(
    prompt: str,
    image_upload: str | None,
    image_url: str,
    thinking: bool,
    max_new_tokens: int,
) -> tuple[str, str]:
    """Handle the Image tab: forward the prompt and image inputs to ``run_request``.

    Takes positional arguments in the order of the click handler's ``inputs``
    list and returns the ``(response_text, parsed_json)`` pair.
    """
    return run_request(
        prompt=prompt,
        thinking=thinking,
        max_new_tokens=max_new_tokens,
        image_upload=image_upload,
        image_url=image_url,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def run_audio(
    prompt: str,
    audio_upload: str | None,
    audio_url: str,
    thinking: bool,
    max_new_tokens: int,
) -> tuple[str, str]:
    """Handle the Audio tab: forward the prompt and audio inputs to ``run_request``.

    Takes positional arguments in the order of the click handler's ``inputs``
    list and returns the ``(response_text, parsed_json)`` pair.
    """
    return run_request(
        prompt=prompt,
        thinking=thinking,
        max_new_tokens=max_new_tokens,
        audio_upload=audio_upload,
        audio_url=audio_url,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def _render_showcase_sample(
    *,
    prompt: str,
    response: str,
    parsed_output: object,
    media_markdown: str | None = None,
    media_url: str | None = None,
) -> None:
    """Create the read-only components that display one pre-recorded sample.

    Components are created in display order: optional Markdown, optional asset
    URL box, prompt, response, then the parsed output as JSON.

    Args:
        prompt: Example prompt text.
        response: Example response text.
        parsed_output: Object pretty-printed into the JSON code viewer.
        media_markdown: Optional Markdown shown above the sample; skipped when
            empty or ``None``.
        media_url: Optional asset URL shown in a read-only textbox; skipped
            when empty or ``None``.
    """
    if media_markdown:
        gr.Markdown(media_markdown)
    if media_url:
        gr.Textbox(label="Sample Asset URL", value=media_url, interactive=False, lines=1)
    gr.Textbox(label="Example Prompt", value=prompt, interactive=False, lines=3)
    gr.Textbox(label="Example Response", value=response, interactive=False, lines=6)
    gr.Code(label="Example Parsed Output", value=_format_json(parsed_output), language="json")
|
||
|
|
|
||
|
|
|
||
|
|
def _build_live_ui() -> None:
    """Build the interactive Text / Image / Audio tabs used when a GPU is present.

    Each tab wires its button to the matching ``run_*`` handler; the order of
    every ``inputs`` list matches that handler's positional parameters, and
    the two outputs receive the response text and the parsed-output JSON.
    """
    gr.Markdown(
        f"# {MODEL_TITLE}\n\n"
        "Live multimodal demo mode is active because GPU hardware is available. "
        "The LumynaX identity comes from the packaged model template and is not user-editable here."
    )
    with gr.Tab("Text"):
        text_prompt = gr.Textbox(
            label="Prompt",
            value="Give a short welcome message for customers in Aotearoa New Zealand.",
            lines=4,
        )
        with gr.Row():
            text_thinking = gr.Checkbox(label="Enable Reasoning", value=False)
            text_max_tokens = gr.Slider(label="Max New Tokens", minimum=16, maximum=256, value=64, step=16)
        text_run = gr.Button("Run Text Demo", variant="primary")
        text_answer = gr.Textbox(label="Response", lines=8)
        text_debug = gr.Code(label="Parsed Output", language="json")
        text_run.click(
            run_text,
            inputs=[text_prompt, text_thinking, text_max_tokens],
            outputs=[text_answer, text_debug],
        )

    with gr.Tab("Image"):
        image_prompt = gr.Textbox(
            label="Prompt",
            value="What is shown in this image? Reply in under 12 words.",
            lines=3,
        )
        # type="filepath": the handler receives the upload as a path string.
        image_upload = gr.Image(label="Upload Image", type="filepath")
        # The URL field starts pre-filled with the sample image.
        image_url = gr.Textbox(label="Or Image URL", value=DEFAULT_IMAGE_URL)
        with gr.Row():
            image_thinking = gr.Checkbox(label="Enable Reasoning", value=False)
            image_max_tokens = gr.Slider(label="Max New Tokens", minimum=16, maximum=256, value=64, step=16)
        image_run = gr.Button("Run Image Demo", variant="primary")
        image_answer = gr.Textbox(label="Response", lines=8)
        image_debug = gr.Code(label="Parsed Output", language="json")
        image_run.click(
            run_image,
            inputs=[image_prompt, image_upload, image_url, image_thinking, image_max_tokens],
            outputs=[image_answer, image_debug],
        )

    with gr.Tab("Audio"):
        audio_prompt = gr.Textbox(
            label="Prompt",
            value="Transcribe the speech in one line only.",
            lines=3,
        )
        # type="filepath": the handler receives the upload as a path string.
        audio_upload = gr.Audio(label="Upload Audio", type="filepath")
        # The URL field starts pre-filled with the sample audio clip.
        audio_url = gr.Textbox(label="Or Audio URL", value=DEFAULT_AUDIO_URL)
        with gr.Row():
            audio_thinking = gr.Checkbox(label="Enable Reasoning", value=False)
            audio_max_tokens = gr.Slider(label="Max New Tokens", minimum=16, maximum=256, value=64, step=16)
        audio_run = gr.Button("Run Audio Demo", variant="primary")
        audio_answer = gr.Textbox(label="Response", lines=8)
        audio_debug = gr.Code(label="Parsed Output", language="json")
        audio_run.click(
            run_audio,
            inputs=[audio_prompt, audio_upload, audio_url, audio_thinking, audio_max_tokens],
            outputs=[audio_answer, audio_debug],
        )
|
||
|
|
|
||
|
|
|
||
|
|
def _build_showcase_ui() -> None:
    """Build the read-only showcase tabs used when no GPU is available.

    The UI consists of an overview, one tab per pre-recorded entry of
    ``SHOWCASE_SAMPLES`` (text, image, audio, reasoning), and a "Run It" tab
    with quickstart commands. Nothing here triggers inference.
    """
    gr.Markdown(
        f"# {MODEL_TITLE}\n\n"
        f"{SHOWCASE_MESSAGE}\n\n"
        "This is still the real package identity and real package structure, but not live inference on this CPU-only Space."
    )
    with gr.Tab("Overview"):
        gr.Markdown(
            "### What this Space is showing\n"
            "- verified text, image, audio, and reasoning examples from package validation\n"
            "- the real packaged Gemma E4B release structure and LumynaX identity behavior\n"
            "- honest provenance: packaged upstream Gemma weights under a LumynaX runtime identity\n\n"
            "### Why this is showcase mode\n"
            "- Hugging Face `cpu-basic` cannot serve this checkpoint interactively\n"
            "- the same Space will switch to live inference automatically if GPU hardware is added later"
        )
    with gr.Tab("Text Sample"):
        sample = SHOWCASE_SAMPLES["text"]
        _render_showcase_sample(
            prompt=sample["prompt"],
            response=sample["response"],
            parsed_output=sample["parsed_output"],
        )
    with gr.Tab("Image Sample"):
        sample = SHOWCASE_SAMPLES["image"]
        _render_showcase_sample(
            prompt=sample["prompt"],
            response=sample["response"],
            parsed_output=sample["parsed_output"],
            # Show the sample image itself. The previous value was an empty
            # f-string, which is falsy, so no image was ever rendered.
            media_markdown=f"![Golden Gate Bridge sample image]({DEFAULT_IMAGE_URL})",
            media_url=DEFAULT_IMAGE_URL,
        )
    with gr.Tab("Audio Sample"):
        sample = SHOWCASE_SAMPLES["audio"]
        _render_showcase_sample(
            prompt=sample["prompt"],
            response=sample["response"],
            parsed_output=sample["parsed_output"],
            media_url=DEFAULT_AUDIO_URL,
        )
    with gr.Tab("Reasoning Note"):
        sample = SHOWCASE_SAMPLES["reasoning"]
        _render_showcase_sample(
            prompt=sample["prompt"],
            response=sample["response"],
            parsed_output=sample["parsed_output"],
        )
    with gr.Tab("Run It"):
        gr.Markdown(
            "### Local or GPU-backed run\n"
            "Use the packaged files directly for a real interactive run, or attach GPU hardware to this Space."
        )
        gr.Textbox(
            label="Quickstart",
            interactive=False,
            lines=4,
            value=(
                "pip install -r requirements.txt\n"
                "python quickstart.py\n"
                "python quickstart.py --mode image --image path-or-url\n"
                "python quickstart.py --mode audio --audio path-or-url"
            ),
        )
|
||
|
|
|
||
|
|
|
||
|
|
# Build the app at import time. The UI variant is chosen once, from whether a
# CUDA device is visible when this module is loaded.
with gr.Blocks() as demo:
    if _has_supported_gpu_runtime():
        _build_live_ui()
    else:
        _build_showcase_ui()


if __name__ == "__main__":
    # Enable the request queue and show handler errors in the UI.
    demo.queue().launch(show_error=True)
|