Initialize the project; model provided by the ModelHub XC community

Model: Mathieu-Thomas-JOSSET/joke-finetome-model-gguf-phi4-20260112-081758
Source: Original Platform
Commit 2c2240df42 by ModelHub XC, 2026-04-11 12:30:59 +08:00
13 changed files with 4725 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
{
"engine": "llama.cpp",
"recommended_endpoint_settings": {
"max_tokens_per_request": 1024,
"max_concurrent_requests": 2,
"notes": "Memory scales roughly with (max_concurrent_requests * max_tokens_per_request)."
},
"recommended_generation_defaults": {
"temperature": 1.2,
"top_p": 0.95,
"min_p": 0.05,
"repeat_penalty": 1.08,
"max_tokens": 2560
},
"chat_template": "phi4",
"gguf_file": "",
"gguf_quant": "q8_0"
}
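
These defaults are meant to be applied client-side when building requests. Below is a minimal sketch of how a caller might consume them, assuming the config is saved as `endpoint_config.json` (the filename is not shown in this diff). Note that `min_p` and `repeat_penalty` are llama.cpp sampler options rather than OpenAI schema fields, so they ride along as extra body parameters:

```python
# Sketch: load the recommended generation defaults from this config
# and build a /v1/chat/completions payload. The filename
# "endpoint_config.json" is an assumption, not taken from this diff.
import json

with open("endpoint_config.json") as f:
    cfg = json.load(f)

defaults = cfg["recommended_generation_defaults"]
payload = {
    "messages": [{"role": "user", "content": "Tell me a joke."}],
    # temperature, top_p, min_p, repeat_penalty, max_tokens
    **defaults,
}
# min_p and repeat_penalty are llama.cpp sampler settings; an
# OpenAI-compatible server such as llama-server accepts them as
# extra top-level fields in the request body.
print(json.dumps(payload, indent=2))
```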

View File

@@ -0,0 +1,23 @@
import os

from huggingface_hub import InferenceClient

# Required env vars:
#   export HF_TOKEN="..."
#   export HF_ENDPOINT_BASE_URL="https://xxxx.endpoints.huggingface.cloud"

# Point the OpenAI-compatible client at the dedicated endpoint.
client = InferenceClient(
    base_url=os.environ["HF_ENDPOINT_BASE_URL"],
    api_key=os.environ["HF_TOKEN"],
)

# Sampling settings follow recommended_generation_defaults in the config above.
resp = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a short joke in the style of The Office."},
    ],
    max_tokens=2560,
    temperature=1.2,
    top_p=0.95,
)
print(resp.choices[0].message.content)
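
For long generations the same client can also stream tokens as they arrive. A minimal sketch reusing the env vars above; `stream=True` is supported by `InferenceClient`'s OpenAI-compatible chat interface:

```python
# Streaming sketch: reuses HF_ENDPOINT_BASE_URL / HF_TOKEN from above.
import os

from huggingface_hub import InferenceClient

client = InferenceClient(
    base_url=os.environ["HF_ENDPOINT_BASE_URL"],
    api_key=os.environ["HF_TOKEN"],
)
stream = client.chat.completions.create(
    messages=[
        {"role": "user", "content": "Write a short joke in the style of The Office."},
    ],
    max_tokens=2560,
    temperature=1.2,
    top_p=0.95,
    stream=True,  # yields incremental deltas instead of one final message
)
for chunk in stream:
    # Each chunk carries a partial delta; content can be None on the final chunk.
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```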

View File

@@ -0,0 +1,12 @@
### Local inference (llama.cpp)
```bash
llama-cli -hf {REPO_ID}:q8_0 -cnv --chat-template phi4
```
### Server (OpenAI-compatible)
```bash
llama-server -hf {REPO_ID}:q8_0
# /v1/chat/completions will be available (OpenAI-compatible)
```
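
Once the server is up, any OpenAI-compatible client can call it. A minimal sketch with plain `requests`, assuming llama-server's default listen address `http://127.0.0.1:8080` (adjust if the server was started with `--host`/`--port`):

```python
# Sketch: query a local llama-server over its OpenAI-compatible API,
# using the generation defaults recommended in the config above.
import requests

resp = requests.post(
    "http://127.0.0.1:8080/v1/chat/completions",
    json={
        "messages": [
            {"role": "user", "content": "Write a short joke in the style of The Office."},
        ],
        "temperature": 1.2,
        "top_p": 0.95,
        "min_p": 0.05,           # llama.cpp sampler option, passed as an extra field
        "repeat_penalty": 1.08,  # likewise llama.cpp-specific
        "max_tokens": 2560,
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```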