Initialize the project; model provided by the ModelHub XC community

Model: Mathieu-Thomas-JOSSET/joke-finetome-model-gguf-phi4-20260112-081758
Source: Original Platform
Commit 2c2240df42 by ModelHub XC, 2026-04-11 12:30:59 +08:00
13 changed files with 4725 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
{
"engine": "llama.cpp",
"recommended_endpoint_settings": {
"max_tokens_per_request": 1024,
"max_concurrent_requests": 2,
"notes": "Memory scales roughly with (max_concurrent_requests * max_tokens_per_request)."
},
"recommended_generation_defaults": {
"temperature": 1.2,
"top_p": 0.95,
"min_p": 0.05,
"repeat_penalty": 1.08,
"max_tokens": 2560
},
"chat_template": "phi4",
"gguf_file": "",
"gguf_quant": "q8_0"
}
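
These defaults are meant to be applied client-side when building requests. Below is a minimal sketch of how a caller might consume them, assuming the config is saved as `endpoint_config.json` (the filename is not shown in this diff). Note that `min_p` and `repeat_penalty` are llama.cpp sampler options rather than OpenAI schema fields, so they ride along as extra body parameters:

```python
# Sketch: load the recommended generation defaults from this config
# and build a /v1/chat/completions payload. The filename
# "endpoint_config.json" is an assumption, not taken from this diff.
import json

with open("endpoint_config.json") as f:
    cfg = json.load(f)

defaults = cfg["recommended_generation_defaults"]
payload = {
    "messages": [{"role": "user", "content": "Tell me a joke."}],
    # temperature, top_p, min_p, repeat_penalty, max_tokens
    **defaults,
}
# min_p and repeat_penalty are llama.cpp sampler settings; an
# OpenAI-compatible server such as llama-server accepts them as
# extra top-level fields in the request body.
print(json.dumps(payload, indent=2))
```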

View File

@@ -0,0 +1,23 @@
import os

from huggingface_hub import InferenceClient

# Required env vars:
#   export HF_TOKEN="..."
#   export HF_ENDPOINT_BASE_URL="https://xxxx.endpoints.huggingface.cloud"

# Point the OpenAI-compatible client at the dedicated endpoint.
client = InferenceClient(
    base_url=os.environ["HF_ENDPOINT_BASE_URL"],
    api_key=os.environ["HF_TOKEN"],
)

# Sampling settings follow recommended_generation_defaults in the config above.
resp = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a short joke in the style of The Office."},
    ],
    max_tokens=2560,
    temperature=1.2,
    top_p=0.95,
)
print(resp.choices[0].message.content)
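
For long generations the same client can also stream tokens as they arrive. A minimal sketch reusing the env vars above; `stream=True` is supported by `InferenceClient`'s OpenAI-compatible chat interface:

```python
# Streaming sketch: reuses HF_ENDPOINT_BASE_URL / HF_TOKEN from above.
import os

from huggingface_hub import InferenceClient

client = InferenceClient(
    base_url=os.environ["HF_ENDPOINT_BASE_URL"],
    api_key=os.environ["HF_TOKEN"],
)
stream = client.chat.completions.create(
    messages=[
        {"role": "user", "content": "Write a short joke in the style of The Office."},
    ],
    max_tokens=2560,
    temperature=1.2,
    top_p=0.95,
    stream=True,  # yields incremental deltas instead of one final message
)
for chunk in stream:
    # Each chunk carries a partial delta; content can be None on the final chunk.
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```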

View File

@@ -0,0 +1,12 @@
### Local inference (llama.cpp)
```bash
llama-cli -hf {REPO_ID}:q8_0 -cnv --chat-template phi4
```
### Server (OpenAI-compatible)
```bash
llama-server -hf {REPO_ID}:q8_0
# /v1/chat/completions will be available (OpenAI-compatible)
```
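
Once the server is up, any OpenAI-compatible client can call it. A minimal sketch with plain `requests`, assuming llama-server's default listen address `http://127.0.0.1:8080` (adjust if the server was started with `--host`/`--port`):

```python
# Sketch: query a local llama-server over its OpenAI-compatible API,
# using the generation defaults recommended in the config above.
import requests

resp = requests.post(
    "http://127.0.0.1:8080/v1/chat/completions",
    json={
        "messages": [
            {"role": "user", "content": "Write a short joke in the style of The Office."},
        ],
        "temperature": 1.2,
        "top_p": 0.95,
        "min_p": 0.05,           # llama.cpp sampler option, passed as an extra field
        "repeat_penalty": 1.08,  # likewise llama.cpp-specific
        "max_tokens": 2560,
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```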