lumynax-infused-qwen3-text-…/quickstart.py

from __future__ import annotations

import argparse
import os
import shutil
import subprocess
import sys
from pathlib import Path

# Human-readable package name; interpolated into the CLI description and the
# default system prompt.
MODEL_TITLE = "LumynaX Infused Qwen3 Text GGUF"


def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the quickstart command line.

    Returns:
        A parser exposing the prompt, sampling, runtime-selection, caching and
        reasoning options read by ``main``.
    """
    cli = argparse.ArgumentParser(description=f"Run a local GGUF chat for {MODEL_TITLE}.")
    add = cli.add_argument

    # Prompt and mode selection.
    add("--prompt", default=None, help="Prompt to send to the model.")
    add("--system-prompt", default="", help="Optional system prompt override.")
    add(
        "--interactive",
        action="store_true",
        help="Start an interactive terminal chat instead of running a single prompt.",
    )

    # Plain typed options that carry no help text; declared in display order.
    detected_cpus = os.cpu_count() or 1
    typed_options = (
        ("--max-new-tokens", int, 192),
        ("--ctx-size", int, 4096),
        ("--temperature", float, 0.1),
        ("--threads", int, max(1, detected_cpus)),
    )
    for flag, caster, fallback in typed_options:
        add(flag, type=caster, default=fallback)

    # Runtime selection and model caching.
    add("--llama-cli", default="", help="Optional explicit path to llama-cli.")
    add(
        "--cache-local",
        action="store_true",
        help="Copy the GGUF into LOCALAPPDATA before running. Useful when a runtime cannot read network paths.",
    )

    # Reasoning controls, forwarded to llama-cli by the runner functions.
    add("--reasoning", choices=("on", "off", "auto"), default="off")
    add(
        "--reasoning-format",
        choices=("auto", "none", "deepseek", "deepseek-legacy"),
        default="auto",
    )
    add("--reasoning-budget", type=int, default=None)
    return cli


def _preferred_gguf(root: Path) -> Path:
    gguf_candidates = sorted(root.glob("*.gguf"))
    if not gguf_candidates:
        raise SystemExit(f"No GGUF file was found in {root}")
    for path in gguf_candidates:
        if "-q" in path.stem.lower():
            return path
    return gguf_candidates[0]


def _local_model_path(model_path: Path, *, cache_local: bool = False) -> Path:
    if not cache_local:
        return model_path
    local_app_data = Path(os.environ.get("LOCALAPPDATA", Path.home() / "AppData" / "Local"))
    cache_dir = local_app_data / "tinyluminax" / "gguf-cache"
    cache_dir.mkdir(parents=True, exist_ok=True)
    cached_path = cache_dir / model_path.name
    source_stat = model_path.stat()
    if (
        not cached_path.exists()
        or cached_path.stat().st_size != source_stat.st_size
        or cached_path.stat().st_mtime_ns < source_stat.st_mtime_ns
    ):
        print(f"Caching GGUF locally at {cached_path}", file=sys.stderr)
        shutil.copy2(model_path, cached_path)
    return cached_path


def _discover_llama_cli(explicit_path: str) -> Path | None:
    candidates: list[Path] = []
    if explicit_path.strip():
        candidates.append(Path(explicit_path.strip()))
    for env_var in ("LLAMA_CPP_CLI", "LLAMA_CLI_PATH"):
        raw_value = os.environ.get(env_var, "").strip()
        if raw_value:
            candidates.append(Path(raw_value))
    for binary_name in ("llama-cli", "llama-cli.exe"):
        resolved = shutil.which(binary_name)
        if resolved:
            candidates.append(Path(resolved))
    for candidate in candidates:
        if candidate.exists():
            return candidate
    return None


def _extract_text(response: dict[str, object]) -> str:
    choices = response.get("choices", [])
    if not isinstance(choices, list) or not choices:
        raise RuntimeError("The runtime returned no choices.")
    first_choice = choices[0]
    if isinstance(first_choice, dict):
        message = first_choice.get("message")
        if isinstance(message, dict):
            content = message.get("content")
            if content not in (None, ""):
                return str(content).strip()
        text = first_choice.get("text")
        if text not in (None, ""):
            return str(text).strip()
    raise RuntimeError("The runtime returned an unsupported response payload.")


def _run_llama_cpp_python(
    *,
    model_path: Path,
    system_prompt: str,
    user_prompt: str,
    max_new_tokens: int,
    ctx_size: int,
    temperature: float,
    threads: int,
) -> str:
    """Answer a single prompt in-process through llama-cpp-python.

    Args:
        model_path: GGUF file to load.
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        max_new_tokens: Passed to the completion call as ``max_tokens``.
        ctx_size: Passed to the loader as ``n_ctx``.
        temperature: Sampling temperature for the completion call.
        threads: Passed to the loader as ``n_threads``.

    Returns:
        The text extracted from the completion by ``_extract_text``.
    """
    # Imported lazily so the script still starts, and can use llama-cli, when
    # llama-cpp-python is not installed.
    from llama_cpp import Llama

    loader_options = {
        "model_path": str(model_path),
        "n_ctx": ctx_size,
        "n_threads": threads,
        "n_gpu_layers": 0,  # presumably keeps inference on the CPU; confirm against llama-cpp-python docs
        "chat_format": "chat_template.default",
        "verbose": False,
    }
    engine = Llama(**loader_options)

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = engine.create_chat_completion(
        messages=conversation,
        max_tokens=max_new_tokens,
        temperature=temperature,
    )
    return _extract_text(completion)


def _run_llama_cli(
    *,
    llama_cli_path: Path,
    model_path: Path,
    system_prompt: str,
    user_prompt: str,
    max_new_tokens: int,
    ctx_size: int,
    temperature: float,
    threads: int,
    reasoning: str,
    reasoning_format: str,
    reasoning_budget: int | None,
) -> None:
    command = [
        str(llama_cli_path),
        "-m",
        str(model_path),
        "-sys",
        system_prompt,
        "-p",
        user_prompt,
        "-cnv",
        "-st",
        "-n",
        str(max_new_tokens),
        "-c",
        str(ctx_size),
        "--reasoning",
        reasoning,
        "--temp",
        str(temperature),
        "--threads",
        str(threads),
        "--no-display-prompt",
    ]
    if reasoning_format != "auto":
        command.extend(["--reasoning-format", reasoning_format])
    if reasoning_budget is not None:
        command.extend(["--reasoning-budget", str(reasoning_budget)])
    completed = subprocess.run(
        command,
        check=False,
        capture_output=True,
        text=True,
        encoding="utf-8",
    )
    if completed.returncode != 0:
        detail = completed.stderr.strip() or completed.stdout.strip() or "llama-cli failed"
        raise SystemExit(detail)
    stdout = completed.stdout.strip()
    if stdout:
        print(stdout)


def _print_interactive_banner() -> None:
    print("LumynaX interactive terminal chat")
    print("Type /reset to clear the conversation, or /quit to exit.")


def _run_interactive_llama_cpp_python(
    *,
    model_path: Path,
    system_prompt: str,
    max_new_tokens: int,
    ctx_size: int,
    temperature: float,
    threads: int,
    opening_prompt: str | None = None,
    reasoning: str = "off",
    reasoning_format: str = "auto",
    reasoning_budget: int | None = None,
) -> None:
    """Run a multi-turn terminal chat in-process through llama-cpp-python.

    Each turn resends the system prompt and every earlier exchange. ``/reset``
    clears the history; ``/quit``, ``/exit``, end-of-input or Ctrl+C at the
    prompt end the chat.

    Args:
        model_path: GGUF file to load.
        system_prompt: Content of the system message sent with every turn.
        max_new_tokens: Passed to each completion call as ``max_tokens``.
        ctx_size: Passed to the loader as ``n_ctx``.
        temperature: Sampling temperature for each completion call.
        threads: Passed to the loader as ``n_threads``.
        opening_prompt: Optional first user message; blank values are ignored.
        reasoning: Not used by this function.
        reasoning_format: Not used by this function.
        reasoning_budget: Not used by this function.
    """
    from llama_cpp import Llama

    loader_options = {
        "model_path": str(model_path),
        "n_ctx": ctx_size,
        "n_threads": threads,
        "n_gpu_layers": 0,
        "chat_format": "chat_template.default",
        "verbose": False,
    }
    engine = Llama(**loader_options)
    # Completed (user prompt, assistant reply) exchanges, oldest first.
    exchanges: list[tuple[str, str]] = []
    _print_interactive_banner()

    queued_prompt = (opening_prompt or "").strip() or None
    while True:
        try:
            if queued_prompt is not None:
                # Replay the opening prompt once, echoing it as if it were typed.
                user_prompt, queued_prompt = queued_prompt, None
                print(f"You> {user_prompt}")
            else:
                user_prompt = input("You> ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nExiting LumynaX chat.")
            return

        if not user_prompt:
            continue
        command_word = user_prompt.lower()
        if command_word == "/reset":
            exchanges.clear()
            print("Conversation reset.")
            continue
        if command_word in ("/quit", "/exit"):
            print("Exiting LumynaX chat.")
            return

        messages: list[dict[str, str]] = [{"role": "system", "content": system_prompt}]
        for earlier_prompt, earlier_reply in exchanges:
            messages += [
                {"role": "user", "content": earlier_prompt},
                {"role": "assistant", "content": earlier_reply},
            ]
        messages.append({"role": "user", "content": user_prompt})

        completion = engine.create_chat_completion(
            messages=messages,
            max_tokens=max_new_tokens,
            temperature=temperature,
        )
        reply = _extract_text(completion)
        print(f"LumynaX> {reply}")
        exchanges.append((user_prompt, reply))


def _run_interactive_llama_cli(
    *,
    llama_cli_path: Path,
    model_path: Path,
    system_prompt: str,
    max_new_tokens: int,
    ctx_size: int,
    temperature: float,
    threads: int,
    opening_prompt: str | None = None,
    reasoning: str = "off",
    reasoning_format: str = "auto",
    reasoning_budget: int | None = None,
) -> None:
    print("LumynaX interactive terminal chat")
    print("Interactive mode already uses llama-cli directly. Use Ctrl+C to exit.")
    command = [
        str(llama_cli_path),
        "-m",
        str(model_path),
        "-sys",
        system_prompt,
        "-cnv",
        "-n",
        str(max_new_tokens),
        "-c",
        str(ctx_size),
        "--reasoning",
        reasoning,
        "--temp",
        str(temperature),
        "--threads",
        str(threads),
        "--simple-io",
    ]
    if reasoning_format != "auto":
        command.extend(["--reasoning-format", reasoning_format])
    if reasoning_budget is not None:
        command.extend(["--reasoning-budget", str(reasoning_budget)])
    if opening_prompt and opening_prompt.strip():
        command.extend(["-p", opening_prompt.strip()])
    completed = subprocess.run(command, check=False)
    if completed.returncode != 0:
        raise SystemExit(completed.returncode)


def main() -> None:
    """Parse the command line, locate the packaged GGUF, and run a chat.

    Runtime selection:

    * If ``--llama-cli`` or one of the ``LLAMA_CPP_CLI`` / ``LLAMA_CLI_PATH``
      environment variables is non-blank, llama-cli is used exclusively.
    * Otherwise llama-cpp-python is tried first; if it raises, llama-cli is
      used as a fallback when one can be discovered.

    Raises:
        SystemExit: If a requested llama-cli cannot be found, or if
            llama-cpp-python fails and no llama-cli fallback is available.
            The helpers called from here may raise it as well.
    """
    options = _build_parser().parse_args()
    package_dir = Path(__file__).resolve().parent
    source_model_path = _preferred_gguf(package_dir)
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")

    single_prompt = (options.prompt or "Say hello in one short sentence.").strip()
    system_prompt = options.system_prompt.strip()
    if not system_prompt:
        system_prompt = (
            f"You are LumynaX operating from the {MODEL_TITLE} package identity. "
            "Be helpful, clear, and honest about provenance."
        )
    override_sources = (
        options.llama_cli,
        os.environ.get("LLAMA_CPP_CLI", ""),
        os.environ.get("LLAMA_CLI_PATH", ""),
    )
    explicit_cli_requested = any(raw.strip() for raw in override_sources)

    # Keyword arguments shared by every runner, and the extra ones that only
    # the llama-cli and interactive runners accept.
    generation = {
        "system_prompt": system_prompt,
        "max_new_tokens": options.max_new_tokens,
        "ctx_size": options.ctx_size,
        "temperature": options.temperature,
        "threads": options.threads,
    }
    reasoning_flags = {
        "reasoning": options.reasoning,
        "reasoning_format": options.reasoning_format,
        "reasoning_budget": options.reasoning_budget,
    }
    missing_override_message = (
        "A llama-cli override was requested, but no usable llama-cli binary was found."
    )

    def _no_fallback(cause: Exception) -> SystemExit:
        # Exit raised when llama-cpp-python failed and no llama-cli was found.
        return SystemExit(
            "llama-cpp-python could not load this GGUF package. "
            "Install or point LLAMA_CPP_CLI at llama-cli to use the built-in fallback. "
            f"Original error: {cause}",
        )

    if options.interactive:
        chat_kwargs = {**generation, **reasoning_flags, "opening_prompt": options.prompt}
        llama_cli_path = _discover_llama_cli(options.llama_cli)
        if explicit_cli_requested:
            if llama_cli_path is None:
                raise SystemExit(missing_override_message)
            _run_interactive_llama_cli(
                llama_cli_path=llama_cli_path,
                model_path=_local_model_path(source_model_path, cache_local=options.cache_local),
                **chat_kwargs,
            )
            return
        model_path = _local_model_path(source_model_path, cache_local=options.cache_local)
        try:
            _run_interactive_llama_cpp_python(model_path=model_path, **chat_kwargs)
        except Exception as exc:  # noqa: BLE001
            # Any failure of the in-process runtime, including a missing
            # llama_cpp module, hands the session over to llama-cli.
            if llama_cli_path is None:
                raise _no_fallback(exc) from exc
            print(
                f"llama-cpp-python failed; falling back to llama-cli at {llama_cli_path}",
                file=sys.stderr,
            )
            _run_interactive_llama_cli(
                llama_cli_path=llama_cli_path,
                model_path=model_path,
                **chat_kwargs,
            )
        return

    prompt_kwargs = {**generation, "user_prompt": single_prompt}
    if explicit_cli_requested:
        llama_cli_path = _discover_llama_cli(options.llama_cli)
        if llama_cli_path is None:
            raise SystemExit(missing_override_message)
        _run_llama_cli(
            llama_cli_path=llama_cli_path,
            model_path=_local_model_path(source_model_path, cache_local=options.cache_local),
            **prompt_kwargs,
            **reasoning_flags,
        )
        return

    model_path = _local_model_path(source_model_path, cache_local=options.cache_local)
    try:
        print(_run_llama_cpp_python(model_path=model_path, **prompt_kwargs))
    except Exception as exc:  # noqa: BLE001
        # The llama-cli lookup is deferred until the in-process attempt fails.
        llama_cli_path = _discover_llama_cli(options.llama_cli)
        if llama_cli_path is None:
            raise _no_fallback(exc) from exc
        print(
            f"llama-cpp-python failed; falling back to llama-cli at {llama_cli_path}",
            file=sys.stderr,
        )
        _run_llama_cli(
            llama_cli_path=llama_cli_path,
            model_path=model_path,
            **prompt_kwargs,
            **reasoning_flags,
        )


# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()