Files
lumynax-infused-qwen3-text-…/quickstart.py

450 lines
16 KiB
Python
Raw Normal View History

from __future__ import annotations
import argparse
import os
import shutil
import subprocess
import sys
from pathlib import Path
# Human-readable package name; interpolated into the CLI description and the
# default system prompt.
MODEL_TITLE = "LumynaX Infused Qwen3 Text GGUF"
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the quickstart script.

    Returns:
        An ``argparse.ArgumentParser`` exposing the prompt, generation,
        runtime-selection and reasoning options, registered in that order.
    """
    cli = argparse.ArgumentParser(description=f"Run a local GGUF chat for {MODEL_TITLE}.")

    # Prompting options.
    cli.add_argument("--prompt", default=None, help="Prompt to send to the model.")
    cli.add_argument("--system-prompt", default="", help="Optional system prompt override.")
    cli.add_argument(
        "--interactive",
        action="store_true",
        help="Start an interactive terminal chat instead of running a single prompt.",
    )

    # Plain numeric generation settings share the same shape, so register them from a table.
    numeric_options = (
        ("--max-new-tokens", int, 192),
        ("--ctx-size", int, 4096),
        ("--temperature", float, 0.1),
        ("--threads", int, max(1, os.cpu_count() or 1)),
    )
    for flag, converter, fallback in numeric_options:
        cli.add_argument(flag, type=converter, default=fallback)

    # Runtime selection.
    cli.add_argument("--llama-cli", default="", help="Optional explicit path to llama-cli.")
    cli.add_argument(
        "--cache-local",
        action="store_true",
        help="Copy the GGUF into LOCALAPPDATA before running. Useful when a runtime cannot read network paths.",
    )

    # Reasoning controls.
    cli.add_argument("--reasoning", choices=("on", "off", "auto"), default="off")
    cli.add_argument(
        "--reasoning-format",
        choices=("auto", "none", "deepseek", "deepseek-legacy"),
        default="auto",
    )
    cli.add_argument("--reasoning-budget", type=int, default=None)
    return cli
def _preferred_gguf(root: Path) -> Path:
    """Pick the GGUF file in *root* that the quickstart should load.

    Files are considered in sorted order. The first one whose stem carries a
    quantization tag (a ``-``, ``.`` or ``_`` separator followed by ``q`` or
    ``iq`` and a digit, e.g. ``-q4_k_m``, ``.Q8_0``, ``-iq4_xs``) wins;
    otherwise the first file is returned.

    Args:
        root: Directory to search (non-recursively) for ``*.gguf`` files.

    Returns:
        Path of the selected GGUF file.

    Raises:
        SystemExit: If *root* contains no ``*.gguf`` file.
    """
    import re

    gguf_candidates = sorted(root.glob("*.gguf"))
    if not gguf_candidates:
        raise SystemExit(f"No GGUF file was found in {root}")

    # A bare "-q" substring test also matches model names such as "-qwen3",
    # which made every file look quantized. Require a digit after the "q".
    quant_tag = re.compile(r"[-._]i?q\d")
    for path in gguf_candidates:
        if quant_tag.search(path.stem.lower()):
            return path
    return gguf_candidates[0]
def _local_model_path(model_path: Path, *, cache_local: bool = False) -> Path:
    """Return the path the runtime should load, optionally via a local cache.

    Args:
        model_path: Location of the source GGUF file.
        cache_local: When true, copy the file into
            ``<LOCALAPPDATA>/tinyluminax/gguf-cache`` (falling back to
            ``~/AppData/Local`` when ``LOCALAPPDATA`` is unset or empty) and
            return the cached copy instead.

    Returns:
        *model_path* unchanged when caching is disabled, otherwise the path
        of the cached copy.

    Raises:
        OSError: If the source cannot be stat'ed or the copy fails.
    """
    if not cache_local:
        return model_path

    # An empty LOCALAPPDATA must not be turned into Path(""), which would
    # silently place the cache relative to the current working directory.
    configured_root = os.environ.get("LOCALAPPDATA", "").strip()
    local_app_data = Path(configured_root) if configured_root else Path.home() / "AppData" / "Local"
    cache_dir = local_app_data / "tinyluminax" / "gguf-cache"
    cache_dir.mkdir(parents=True, exist_ok=True)
    cached_path = cache_dir / model_path.name

    source_stat = model_path.stat()
    try:
        # Stat once so the size and mtime checks see the same snapshot.
        cached_stat = cached_path.stat()
    except FileNotFoundError:
        cached_stat = None

    # copy2 preserves the modification time, so an up-to-date cache has the
    # same size and an mtime that is not older than the source.
    needs_copy = (
        cached_stat is None
        or cached_stat.st_size != source_stat.st_size
        or cached_stat.st_mtime_ns < source_stat.st_mtime_ns
    )
    if needs_copy:
        print(f"Caching GGUF locally at {cached_path}", file=sys.stderr)
        shutil.copy2(model_path, cached_path)
    return cached_path
def _discover_llama_cli(explicit_path: str) -> Path | None:
    """Locate a llama-cli binary.

    Candidates are tried in this order: *explicit_path*, the
    ``LLAMA_CPP_CLI`` and ``LLAMA_CLI_PATH`` environment variables, then
    ``llama-cli`` / ``llama-cli.exe`` resolved through ``PATH``.

    Args:
        explicit_path: Path supplied by the user; blank means "not given".

    Returns:
        The first candidate that is an existing file, or ``None`` when no
        candidate qualifies.
    """
    candidates: list[Path] = []

    requested = explicit_path.strip()
    if requested:
        candidates.append(Path(requested))

    for env_var in ("LLAMA_CPP_CLI", "LLAMA_CLI_PATH"):
        raw_value = os.environ.get(env_var, "").strip()
        if raw_value:
            candidates.append(Path(raw_value))

    for binary_name in ("llama-cli", "llama-cli.exe"):
        resolved = shutil.which(binary_name)
        if resolved:
            candidates.append(Path(resolved))

    # is_file() rather than exists(): a directory can never be launched as the
    # binary, so it must not shadow a usable candidate further down the list.
    return next((candidate for candidate in candidates if candidate.is_file()), None)
def _extract_text(response: dict[str, object]) -> str:
    """Pull the generated text out of a completion response.

    Only the first entry of ``response["choices"]`` is inspected. Its
    ``message["content"]`` is preferred; ``text`` is used when the former is
    missing, ``None`` or empty.

    Args:
        response: Completion payload containing a ``choices`` list.

    Returns:
        The selected value converted with ``str`` and stripped of
        surrounding whitespace.

    Raises:
        RuntimeError: If ``choices`` is missing, empty or not a list, or if
            the first choice holds no usable text.
    """
    candidates = response.get("choices", [])
    if not (isinstance(candidates, list) and candidates):
        raise RuntimeError("The runtime returned no choices.")

    head = candidates[0]
    if isinstance(head, dict):
        chat_message = head.get("message")
        chat_content = chat_message.get("content") if isinstance(chat_message, dict) else None
        # Chat-style content takes priority over the plain completion field.
        for payload in (chat_content, head.get("text")):
            if payload not in (None, ""):
                return str(payload).strip()

    raise RuntimeError("The runtime returned an unsupported response payload.")
def _run_llama_cpp_python(
    *,
    model_path: Path,
    system_prompt: str,
    user_prompt: str,
    max_new_tokens: int,
    ctx_size: int,
    temperature: float,
    threads: int,
) -> str:
    """Answer a single prompt in-process through llama-cpp-python.

    Args:
        model_path: GGUF file to load.
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        max_new_tokens: Passed as ``max_tokens`` to the completion call.
        ctx_size: Passed as ``n_ctx`` when loading the model.
        temperature: Sampling temperature for the completion call.
        threads: Passed as ``n_threads`` when loading the model.

    Returns:
        The assistant text extracted from the completion response.
    """
    # Imported inside the function, so a missing llama-cpp-python only
    # surfaces when this code path is actually taken.
    from llama_cpp import Llama

    load_options = {
        "model_path": str(model_path),
        "n_ctx": ctx_size,
        "n_threads": threads,
        "n_gpu_layers": 0,
        "chat_format": "chat_template.default",
        "verbose": False,
    }
    engine = Llama(**load_options)

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = engine.create_chat_completion(
        messages=conversation,
        max_tokens=max_new_tokens,
        temperature=temperature,
    )
    return _extract_text(completion)
def _run_llama_cli(
    *,
    llama_cli_path: Path,
    model_path: Path,
    system_prompt: str,
    user_prompt: str,
    max_new_tokens: int,
    ctx_size: int,
    temperature: float,
    threads: int,
    reasoning: str,
    reasoning_format: str,
    reasoning_budget: int | None,
) -> None:
    """Run one prompt through the llama-cli binary and print its output.

    The child's stdout and stderr are captured; on success the stripped
    stdout is printed when it is non-empty.

    Args:
        llama_cli_path: Executable to launch.
        model_path: GGUF file passed with ``-m``.
        system_prompt: Value passed with ``-sys``.
        user_prompt: Value passed with ``-p``.
        max_new_tokens: Value passed with ``-n``.
        ctx_size: Value passed with ``-c``.
        temperature: Value passed with ``--temp``.
        threads: Value passed with ``--threads``.
        reasoning: Value passed with ``--reasoning``.
        reasoning_format: Passed with ``--reasoning-format`` unless ``"auto"``.
        reasoning_budget: Passed with ``--reasoning-budget`` unless ``None``.

    Raises:
        SystemExit: If the process exits with a non-zero status. The message
            is its stderr, else its stdout, else ``"llama-cli failed"``.
    """
    command = [
        str(llama_cli_path),
        "-m",
        str(model_path),
        "-sys",
        system_prompt,
        "-p",
        user_prompt,
        # NOTE(review): presumably conversation mode limited to a single turn;
        # confirm against the installed llama-cli's --help.
        "-cnv",
        "-st",
        "-n",
        str(max_new_tokens),
        "-c",
        str(ctx_size),
        "--reasoning",
        reasoning,
        "--temp",
        str(temperature),
        "--threads",
        str(threads),
        "--no-display-prompt",
    ]
    if reasoning_format != "auto":
        command.extend(["--reasoning-format", reasoning_format])
    if reasoning_budget is not None:
        command.extend(["--reasoning-budget", str(reasoning_budget)])

    completed = subprocess.run(
        command,
        check=False,
        capture_output=True,
        text=True,
        encoding="utf-8",
        # Generated text can contain invalid or truncated UTF-8 sequences;
        # substitute them instead of crashing with UnicodeDecodeError.
        errors="replace",
    )
    if completed.returncode != 0:
        detail = completed.stderr.strip() or completed.stdout.strip() or "llama-cli failed"
        raise SystemExit(detail)

    stdout = completed.stdout.strip()
    if stdout:
        print(stdout)
def _print_interactive_banner() -> None:
    """Print the two-line header shown at the start of an interactive chat."""
    banner_lines = (
        "LumynaX interactive terminal chat",
        "Type /reset to clear the conversation, or /quit to exit.",
    )
    for banner_line in banner_lines:
        print(banner_line)
def _run_interactive_llama_cpp_python(
    *,
    model_path: Path,
    system_prompt: str,
    max_new_tokens: int,
    ctx_size: int,
    temperature: float,
    threads: int,
    opening_prompt: str | None = None,
    reasoning: str = "off",
    reasoning_format: str = "auto",
    reasoning_budget: int | None = None,
) -> None:
    """Run a terminal chat loop in-process through llama-cpp-python.

    Every turn resends the system prompt plus all earlier exchanges.
    ``/reset`` clears that history, ``/quit`` or ``/exit`` ends the session,
    and so does EOF or Ctrl+C at the prompt.

    Args:
        model_path: GGUF file to load.
        system_prompt: Content of the system message sent with every turn.
        max_new_tokens: Passed as ``max_tokens`` to each completion call.
        ctx_size: Passed as ``n_ctx`` when loading the model.
        temperature: Sampling temperature for each completion call.
        threads: Passed as ``n_threads`` when loading the model.
        opening_prompt: Optional first user message, used instead of reading
            from the terminal when it is non-blank.
        reasoning: Accepted for signature parity with the llama-cli runner;
            not used by this function.
        reasoning_format: Accepted for signature parity; not used here.
        reasoning_budget: Accepted for signature parity; not used here.
    """
    from llama_cpp import Llama

    engine = Llama(
        model_path=str(model_path),
        n_ctx=ctx_size,
        n_threads=threads,
        n_gpu_layers=0,
        chat_format="chat_template.default",
        verbose=False,
    )
    exchanges: list[tuple[str, str]] = []
    _print_interactive_banner()

    # A blank or missing opening prompt means the first turn is typed by the user.
    queued_prompt = (opening_prompt or "").strip() or None
    while True:
        try:
            if queued_prompt is not None:
                user_prompt = queued_prompt
                print(f"You> {user_prompt}")
                queued_prompt = None
            else:
                user_prompt = input("You> ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nExiting LumynaX chat.")
            return

        if not user_prompt:
            continue

        command_word = user_prompt.lower()
        if command_word in ('/quit', '/exit'):
            print("Exiting LumynaX chat.")
            return
        if command_word == "/reset":
            exchanges.clear()
            print("Conversation reset.")
            continue

        # Rebuild the full message list for this turn: system, history, new question.
        messages: list[dict[str, str]] = [{"role": "system", "content": system_prompt}]
        for earlier_question, earlier_answer in exchanges:
            messages += [
                {"role": "user", "content": earlier_question},
                {"role": "assistant", "content": earlier_answer},
            ]
        messages.append({"role": "user", "content": user_prompt})

        completion = engine.create_chat_completion(
            messages=messages,
            max_tokens=max_new_tokens,
            temperature=temperature,
        )
        reply = _extract_text(completion)
        print(f"LumynaX> {reply}")
        exchanges.append((user_prompt, reply))
def _run_interactive_llama_cli(
    *,
    llama_cli_path: Path,
    model_path: Path,
    system_prompt: str,
    max_new_tokens: int,
    ctx_size: int,
    temperature: float,
    threads: int,
    opening_prompt: str | None = None,
    reasoning: str = "off",
    reasoning_format: str = "auto",
    reasoning_budget: int | None = None,
) -> None:
    """Hand the terminal over to llama-cli for an interactive chat.

    The child process is started without output capture, so it talks to the
    terminal directly.

    Args:
        llama_cli_path: Executable to launch.
        model_path: GGUF file passed with ``-m``.
        system_prompt: Value passed with ``-sys``.
        max_new_tokens: Value passed with ``-n``.
        ctx_size: Value passed with ``-c``.
        temperature: Value passed with ``--temp``.
        threads: Value passed with ``--threads``.
        opening_prompt: Passed (stripped) with ``-p`` when non-blank.
        reasoning: Value passed with ``--reasoning``.
        reasoning_format: Passed with ``--reasoning-format`` unless ``"auto"``.
        reasoning_budget: Passed with ``--reasoning-budget`` unless ``None``.

    Raises:
        SystemExit: With the child's exit status when that status is non-zero.
    """
    print("LumynaX interactive terminal chat")
    print("Interactive mode already uses llama-cli directly. Use Ctrl+C to exit.")

    argv = [str(llama_cli_path), "-m", str(model_path), "-sys", system_prompt, "-cnv"]
    argv += ["-n", str(max_new_tokens), "-c", str(ctx_size)]
    argv += ["--reasoning", reasoning]
    argv += ["--temp", str(temperature), "--threads", str(threads)]
    argv.append("--simple-io")

    # Optional flags are appended only when they differ from the "unset" value.
    if reasoning_format != "auto":
        argv += ["--reasoning-format", reasoning_format]
    if reasoning_budget is not None:
        argv += ["--reasoning-budget", str(reasoning_budget)]
    first_message = opening_prompt.strip() if opening_prompt else ""
    if first_message:
        argv += ["-p", first_message]

    exit_status = subprocess.run(argv, check=False).returncode
    if exit_status != 0:
        raise SystemExit(exit_status)
def _require_llama_cli(explicit_path: str) -> Path:
    """Return the discovered llama-cli path, exiting when an override cannot be honoured."""
    llama_cli_path = _discover_llama_cli(explicit_path)
    if llama_cli_path is None:
        raise SystemExit(
            "A llama-cli override was requested, but no usable llama-cli binary was found.",
        )
    return llama_cli_path


def _fallback_llama_cli(explicit_path: str, exc: Exception) -> Path:
    """Return the llama-cli path to fall back to after *exc*, exiting when none is available."""
    llama_cli_path = _discover_llama_cli(explicit_path)
    if llama_cli_path is None:
        raise SystemExit(
            "llama-cpp-python could not load this GGUF package. "
            "Install or point LLAMA_CPP_CLI at llama-cli to use the built-in fallback. "
            f"Original error: {exc}",
        ) from exc
    print(
        f"llama-cpp-python failed; falling back to llama-cli at {llama_cli_path}",
        file=sys.stderr,
    )
    return llama_cli_path


def main() -> None:
    """Entry point: parse the CLI arguments and run the model once or interactively.

    Runtime selection:

    * If ``--llama-cli`` or one of the ``LLAMA_CPP_CLI`` / ``LLAMA_CLI_PATH``
      environment variables is set, llama-cli is used directly.
    * Otherwise llama-cpp-python is tried first, and any exception it raises
      triggers a fallback to a llama-cli binary when one can be discovered.

    Raises:
        SystemExit: If no GGUF file sits next to this script, if a requested
            llama-cli override cannot be found, if llama-cpp-python fails
            with no llama-cli available, or if llama-cli itself fails.
    """
    args = _build_parser().parse_args()
    root = Path(__file__).resolve().parent
    source_model_path = _preferred_gguf(root)
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")

    # Strip before applying the default so a whitespace-only --prompt does not
    # end up as an empty user message.
    single_prompt = (args.prompt or "").strip() or "Say hello in one short sentence."
    system_prompt = args.system_prompt.strip() or (
        f"You are LumynaX operating from the {MODEL_TITLE} package identity. "
        "Be helpful, clear, and honest about provenance."
    )
    explicit_cli_requested = bool(
        args.llama_cli.strip()
        or os.environ.get("LLAMA_CPP_CLI", "").strip()
        or os.environ.get("LLAMA_CLI_PATH", "").strip()
    )

    # Keyword arguments shared by every runner.
    generation_options = {
        "system_prompt": system_prompt,
        "max_new_tokens": args.max_new_tokens,
        "ctx_size": args.ctx_size,
        "temperature": args.temperature,
        "threads": args.threads,
    }
    # Keyword arguments accepted by every runner except the single-prompt
    # llama-cpp-python one.
    reasoning_options = {
        "reasoning": args.reasoning,
        "reasoning_format": args.reasoning_format,
        "reasoning_budget": args.reasoning_budget,
    }

    def run_with_llama_cli(llama_cli_path: Path, model_path: Path) -> None:
        """Dispatch to the interactive or single-prompt llama-cli runner."""
        if args.interactive:
            _run_interactive_llama_cli(
                llama_cli_path=llama_cli_path,
                model_path=model_path,
                opening_prompt=args.prompt,
                **generation_options,
                **reasoning_options,
            )
        else:
            _run_llama_cli(
                llama_cli_path=llama_cli_path,
                model_path=model_path,
                user_prompt=single_prompt,
                **generation_options,
                **reasoning_options,
            )

    if explicit_cli_requested:
        # Resolve the binary before (possibly) copying the model into the cache.
        llama_cli_path = _require_llama_cli(args.llama_cli)
        run_with_llama_cli(
            llama_cli_path,
            _local_model_path(source_model_path, cache_local=args.cache_local),
        )
        return

    model_path = _local_model_path(source_model_path, cache_local=args.cache_local)
    try:
        if args.interactive:
            _run_interactive_llama_cpp_python(
                model_path=model_path,
                opening_prompt=args.prompt,
                **generation_options,
                **reasoning_options,
            )
        else:
            print(
                _run_llama_cpp_python(
                    model_path=model_path,
                    user_prompt=single_prompt,
                    **generation_options,
                ),
            )
    except Exception as exc:  # noqa: BLE001
        # Deliberately broad: an import error, a load failure or a generation
        # error all lead to the same llama-cli fallback.
        run_with_llama_cli(_fallback_llama_cli(args.llama_cli, exc), model_path)


if __name__ == "__main__":
    main()