from __future__ import annotations import argparse import os import shutil import subprocess import sys from pathlib import Path MODEL_TITLE = "LumynaX Infused Qwen3 Text GGUF" def _build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description=f"Run a local GGUF chat for {MODEL_TITLE}.") parser.add_argument( "--prompt", default=None, help="Prompt to send to the model.", ) parser.add_argument("--system-prompt", default="", help="Optional system prompt override.") parser.add_argument( "--interactive", action="store_true", help="Start an interactive terminal chat instead of running a single prompt.", ) parser.add_argument("--max-new-tokens", type=int, default=192) parser.add_argument("--ctx-size", type=int, default=4096) parser.add_argument("--temperature", type=float, default=0.1) parser.add_argument("--threads", type=int, default=max(1, os.cpu_count() or 1)) parser.add_argument("--llama-cli", default="", help="Optional explicit path to llama-cli.") parser.add_argument( "--cache-local", action="store_true", help="Copy the GGUF into LOCALAPPDATA before running. Useful when a runtime cannot read network paths.", ) parser.add_argument("--reasoning", choices=("on", "off", "auto"), default="off") parser.add_argument( "--reasoning-format", choices=("auto", "none", "deepseek", "deepseek-legacy"), default="auto", ) parser.add_argument("--reasoning-budget", type=int, default=None) return parser def _preferred_gguf(root: Path) -> Path: gguf_candidates = sorted(root.glob("*.gguf")) if not gguf_candidates: raise SystemExit(f"No GGUF file was found in {root}") for path in gguf_candidates: if "-q" in path.stem.lower(): return path return gguf_candidates[0] def _local_model_path(model_path: Path, *, cache_local: bool = False) -> Path: if not cache_local: return model_path local_app_data = Path(os.environ.get("LOCALAPPDATA", Path.home() / "AppData" / "Local")) cache_dir = local_app_data / "tinyluminax" / "gguf-cache" cache_dir.mkdir(parents=True, exist_ok=True) cached_path = cache_dir / model_path.name source_stat = model_path.stat() if ( not cached_path.exists() or cached_path.stat().st_size != source_stat.st_size or cached_path.stat().st_mtime_ns < source_stat.st_mtime_ns ): print(f"Caching GGUF locally at {cached_path}", file=sys.stderr) shutil.copy2(model_path, cached_path) return cached_path def _discover_llama_cli(explicit_path: str) -> Path | None: candidates: list[Path] = [] if explicit_path.strip(): candidates.append(Path(explicit_path.strip())) for env_var in ("LLAMA_CPP_CLI", "LLAMA_CLI_PATH"): raw_value = os.environ.get(env_var, "").strip() if raw_value: candidates.append(Path(raw_value)) for binary_name in ("llama-cli", "llama-cli.exe"): resolved = shutil.which(binary_name) if resolved: candidates.append(Path(resolved)) for candidate in candidates: if candidate.exists(): return candidate return None def _extract_text(response: dict[str, object]) -> str: choices = response.get("choices", []) if not isinstance(choices, list) or not choices: raise RuntimeError("The runtime returned no choices.") first_choice = choices[0] if isinstance(first_choice, dict): message = first_choice.get("message") if isinstance(message, dict): content = message.get("content") if content not in (None, ""): return str(content).strip() text = first_choice.get("text") if text not in (None, ""): return str(text).strip() raise RuntimeError("The runtime returned an unsupported response payload.") def _run_llama_cpp_python( *, model_path: Path, system_prompt: str, user_prompt: str, max_new_tokens: int, ctx_size: int, temperature: float, threads: int, ) -> str: from llama_cpp import Llama llm = Llama( model_path=str(model_path), n_ctx=ctx_size, n_threads=threads, n_gpu_layers=0, chat_format="chat_template.default", verbose=False, ) response = llm.create_chat_completion( messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}, ], max_tokens=max_new_tokens, temperature=temperature, ) return _extract_text(response) def _run_llama_cli( *, llama_cli_path: Path, model_path: Path, system_prompt: str, user_prompt: str, max_new_tokens: int, ctx_size: int, temperature: float, threads: int, reasoning: str, reasoning_format: str, reasoning_budget: int | None, ) -> None: command = [ str(llama_cli_path), "-m", str(model_path), "-sys", system_prompt, "-p", user_prompt, "-cnv", "-st", "-n", str(max_new_tokens), "-c", str(ctx_size), "--reasoning", reasoning, "--temp", str(temperature), "--threads", str(threads), "--no-display-prompt", ] if reasoning_format != "auto": command.extend(["--reasoning-format", reasoning_format]) if reasoning_budget is not None: command.extend(["--reasoning-budget", str(reasoning_budget)]) completed = subprocess.run( command, check=False, capture_output=True, text=True, encoding="utf-8", ) if completed.returncode != 0: detail = completed.stderr.strip() or completed.stdout.strip() or "llama-cli failed" raise SystemExit(detail) stdout = completed.stdout.strip() if stdout: print(stdout) def _print_interactive_banner() -> None: print("LumynaX interactive terminal chat") print("Type /reset to clear the conversation, or /quit to exit.") def _run_interactive_llama_cpp_python( *, model_path: Path, system_prompt: str, max_new_tokens: int, ctx_size: int, temperature: float, threads: int, opening_prompt: str | None = None, reasoning: str = "off", reasoning_format: str = "auto", reasoning_budget: int | None = None, ) -> None: from llama_cpp import Llama llm = Llama( model_path=str(model_path), n_ctx=ctx_size, n_threads=threads, n_gpu_layers=0, chat_format="chat_template.default", verbose=False, ) transcript: list[tuple[str, str]] = [] _print_interactive_banner() pending_prompt = opening_prompt.strip() if opening_prompt and opening_prompt.strip() else None while True: try: if pending_prompt is None: user_prompt = input("You> ").strip() else: user_prompt = pending_prompt print(f"You> {user_prompt}") pending_prompt = None except (EOFError, KeyboardInterrupt): print("\nExiting LumynaX chat.") return if not user_prompt: continue lowered_prompt = user_prompt.lower() if lowered_prompt in ('/quit', '/exit'): print("Exiting LumynaX chat.") return if lowered_prompt == "/reset": transcript.clear() print("Conversation reset.") continue messages: list[dict[str, str]] = [{"role": "system", "content": system_prompt}] for transcript_user_prompt, transcript_assistant_response in transcript: messages.append({"role": "user", "content": transcript_user_prompt}) messages.append({"role": "assistant", "content": transcript_assistant_response}) messages.append({"role": "user", "content": user_prompt}) response = llm.create_chat_completion( messages=messages, max_tokens=max_new_tokens, temperature=temperature, ) assistant_text = _extract_text(response) print(f"LumynaX> {assistant_text}") transcript.append((user_prompt, assistant_text)) def _run_interactive_llama_cli( *, llama_cli_path: Path, model_path: Path, system_prompt: str, max_new_tokens: int, ctx_size: int, temperature: float, threads: int, opening_prompt: str | None = None, reasoning: str = "off", reasoning_format: str = "auto", reasoning_budget: int | None = None, ) -> None: print("LumynaX interactive terminal chat") print("Interactive mode already uses llama-cli directly. Use Ctrl+C to exit.") command = [ str(llama_cli_path), "-m", str(model_path), "-sys", system_prompt, "-cnv", "-n", str(max_new_tokens), "-c", str(ctx_size), "--reasoning", reasoning, "--temp", str(temperature), "--threads", str(threads), "--simple-io", ] if reasoning_format != "auto": command.extend(["--reasoning-format", reasoning_format]) if reasoning_budget is not None: command.extend(["--reasoning-budget", str(reasoning_budget)]) if opening_prompt and opening_prompt.strip(): command.extend(["-p", opening_prompt.strip()]) completed = subprocess.run(command, check=False) if completed.returncode != 0: raise SystemExit(completed.returncode) def main() -> None: args = _build_parser().parse_args() root = Path(__file__).resolve().parent source_model_path = _preferred_gguf(root) if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(encoding="utf-8") single_prompt = (args.prompt or "Say hello in one short sentence.").strip() system_prompt = args.system_prompt.strip() or ( f"You are LumynaX operating from the {MODEL_TITLE} package identity. " "Be helpful, clear, and honest about provenance." ) explicit_cli_requested = bool( args.llama_cli.strip() or os.environ.get("LLAMA_CPP_CLI", "").strip() or os.environ.get("LLAMA_CLI_PATH", "").strip() ) if args.interactive: llama_cli_path = _discover_llama_cli(args.llama_cli) if explicit_cli_requested: if llama_cli_path is None: raise SystemExit( "A llama-cli override was requested, but no usable llama-cli binary was found.", ) _run_interactive_llama_cli( llama_cli_path=llama_cli_path, model_path=_local_model_path(source_model_path, cache_local=args.cache_local), system_prompt=system_prompt, opening_prompt=args.prompt, max_new_tokens=args.max_new_tokens, ctx_size=args.ctx_size, temperature=args.temperature, threads=args.threads, reasoning=args.reasoning, reasoning_format=args.reasoning_format, reasoning_budget=args.reasoning_budget, ) return model_path = _local_model_path(source_model_path, cache_local=args.cache_local) try: _run_interactive_llama_cpp_python( model_path=model_path, system_prompt=system_prompt, opening_prompt=args.prompt, max_new_tokens=args.max_new_tokens, ctx_size=args.ctx_size, temperature=args.temperature, threads=args.threads, reasoning=args.reasoning, reasoning_format=args.reasoning_format, reasoning_budget=args.reasoning_budget, ) return except Exception as exc: # noqa: BLE001 if llama_cli_path is None: raise SystemExit( "llama-cpp-python could not load this GGUF package. " "Install or point LLAMA_CPP_CLI at llama-cli to use the built-in fallback. " f"Original error: {exc}", ) from exc print( f"llama-cpp-python failed; falling back to llama-cli at {llama_cli_path}", file=sys.stderr, ) _run_interactive_llama_cli( llama_cli_path=llama_cli_path, model_path=model_path, system_prompt=system_prompt, opening_prompt=args.prompt, max_new_tokens=args.max_new_tokens, ctx_size=args.ctx_size, temperature=args.temperature, threads=args.threads, reasoning=args.reasoning, reasoning_format=args.reasoning_format, reasoning_budget=args.reasoning_budget, ) return if explicit_cli_requested: llama_cli_path = _discover_llama_cli(args.llama_cli) if llama_cli_path is None: raise SystemExit( "A llama-cli override was requested, but no usable llama-cli binary was found.", ) _run_llama_cli( llama_cli_path=llama_cli_path, model_path=_local_model_path(source_model_path, cache_local=args.cache_local), system_prompt=system_prompt, user_prompt=single_prompt, max_new_tokens=args.max_new_tokens, ctx_size=args.ctx_size, temperature=args.temperature, threads=args.threads, reasoning=args.reasoning, reasoning_format=args.reasoning_format, reasoning_budget=args.reasoning_budget, ) return model_path = _local_model_path(source_model_path, cache_local=args.cache_local) try: print( _run_llama_cpp_python( model_path=model_path, system_prompt=system_prompt, user_prompt=single_prompt, max_new_tokens=args.max_new_tokens, ctx_size=args.ctx_size, temperature=args.temperature, threads=args.threads, ), ) return except Exception as exc: # noqa: BLE001 llama_cli_path = _discover_llama_cli(args.llama_cli) if llama_cli_path is None: raise SystemExit( "llama-cpp-python could not load this GGUF package. " "Install or point LLAMA_CPP_CLI at llama-cli to use the built-in fallback. " f"Original error: {exc}", ) from exc print( f"llama-cpp-python failed; falling back to llama-cli at {llama_cli_path}", file=sys.stderr, ) _run_llama_cli( llama_cli_path=llama_cli_path, model_path=model_path, system_prompt=system_prompt, user_prompt=single_prompt, max_new_tokens=args.max_new_tokens, ctx_size=args.ctx_size, temperature=args.temperature, threads=args.threads, reasoning=args.reasoning, reasoning_format=args.reasoning_format, reasoning_budget=args.reasoning_budget, ) if __name__ == "__main__": main()