初始化项目,由ModelHub XC社区提供模型

Model: QuixiAI/Llama-3.2-1B
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-06-02 19:33:12 +08:00
commit 5f7e251fae
11 changed files with 2493 additions and 0 deletions

50
.gitattributes vendored Normal file
View File

@@ -0,0 +1,50 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*.tfevents* filter=lfs diff=lfs merge=lfs -text
*.db* filter=lfs diff=lfs merge=lfs -text
*.ark* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.gguf* filter=lfs diff=lfs merge=lfs -text
*.ggml filter=lfs diff=lfs merge=lfs -text
*.llamafile* filter=lfs diff=lfs merge=lfs -text
*.pt2 filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
model.safetensors filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text

31
README.md Normal file
View File

@@ -0,0 +1,31 @@
---
base_model: meta-llama/Llama-3.2-1B-Instruct
language:
- en
library_name: transformers
license: llama3.2
tags:
- llama-3
- llama
- meta
- facebook
- transformers
---
Quantizing Llama-3.2-1B
Eric Hartford
I am creating several quants of Llama-3.2-1B for the purpose of testing vLLM Marlin.
- https://huggingface.co/QuixiAI/Llama-3.2-1B
- https://huggingface.co/QuixiAI/Llama-3.2-1B-FP8-Dynamic
- https://huggingface.co/QuixiAI/Llama-3.2-1B-MXFP4
- https://huggingface.co/QuixiAI/Llama-3.2-1B-NVFP4A16
- https://huggingface.co/QuixiAI/Llama-3.2-1B-W4A16-AWQ
- https://huggingface.co/QuixiAI/Llama-3.2-1B-W4A16-GPTQ
- https://huggingface.co/QuixiAI/Llama-3.2-1B-W8A16-GPTQ
The script I used to produce these quants:
[quant.py](quant.py)

93
chat_template.jinja Normal file
View File

@@ -0,0 +1,93 @@
{#- Chat template: renders `messages` as a sequence of turns, each written as a start_header_id/role/end_header_id header followed by the content and an eot_id marker. #}
{#- Inputs read below: bos_token, messages, add_generation_prompt, and the optional custom_tools, tools, tools_in_user_message, date_string, strftime_now. #}
{{- bos_token }}
{#- custom_tools, when supplied, overrides tools. #}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{#- Unless the caller says otherwise, tool definitions are placed in the first user turn rather than the system turn. #}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = true %}
{%- endif %}
{#- Date shown in the system turn: caller-supplied, else strftime_now when available, else a fixed fallback string. #}
{%- if not date_string is defined %}
{%- if strftime_now is defined %}
{%- set date_string = strftime_now("%d %b %Y") %}
{%- else %}
{%- set date_string = "26 Jul 2024" %}
{%- endif %}
{%- endif %}
{#- Normalize a missing tools variable to none so later checks can use "is none". #}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content']|trim %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = "" %}
{%- endif %}
{#- System message #}
{#- A system turn is always emitted, even when system_message is empty, because it carries the date lines. #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if tools is not none %}
{{- "Environment: ipython\n" }}
{%- endif %}
{{- "Cutting Knowledge Date: December 2023\n" }}
{{- "Today Date: " + date_string + "\n\n" }}
{#- Tool definitions go into the system turn only when tools_in_user_message is false. #}
{%- if tools is not none and not tools_in_user_message %}
{{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
{{- "Do not use variables.\n\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- system_message }}
{{- "<|eot_id|>" }}
{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and not tools is none %}
{#- Extract the first user message so we can plug it in here #}
{#- NOTE(review): the first remaining message is consumed without checking its role; presumably it is always a user turn - confirm against callers. #}
{%- if messages | length != 0 %}
{%- set first_user_message = messages[0]['content']|trim %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
{{- "Given the following functions, please respond with a JSON for a function call " }}
{{- "with its proper arguments that best answers the given prompt.\n\n" }}
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
{{- "Do not use variables.\n\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- first_user_message + "<|eot_id|>"}}
{%- endif %}
{#- Remaining turns fall into three cases: plain messages, assistant tool calls, and tool/ipython results. #}
{%- for message in messages %}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{#- Plain turn: role header, trimmed content, end-of-turn marker. #}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
{%- elif 'tool_calls' in message %}
{#- Assistant tool call: exactly one call is accepted and it is rendered as a JSON object with name and parameters. #}
{%- if not message.tool_calls|length == 1 %}
{{- raise_exception("This model only supports single tool-calls at once!") }}
{%- endif %}
{%- set tool_call = message.tool_calls[0].function %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
{{- '{"name": "' + tool_call.name + '", ' }}
{{- '"parameters": ' }}
{{- tool_call.arguments | tojson }}
{{- "}" }}
{{- "<|eot_id|>" }}
{%- elif message.role == "tool" or message.role == "ipython" %}
{#- Tool output is always emitted under the ipython role, whichever of the two role names the message used. #}
{{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
{#- NOTE(review): the Jinja "iterable" test is also true for strings, so plain string content is JSON-encoded here too and the else branch is only reached for non-iterable values - confirm this is intended. #}
{%- if message.content is mapping or message.content is iterable %}
{{- message.content | tojson }}
{%- else %}
{{- message.content }}
{%- endif %}
{{- "<|eot_id|>" }}
{%- endif %}
{%- endfor %}
{#- Optionally open an assistant turn so generation continues from there. #}
{%- if add_generation_prompt %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}

37
config.json Normal file
View File

@@ -0,0 +1,37 @@
{
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": 128009,
"head_dim": 64,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 8192,
"max_position_embeddings": 131072,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 16,
"num_key_value_heads": 8,
"pad_token_id": 128004,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 32.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"rope_theta": 500000.0,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.52.0.dev0",
"unsloth_fixed": true,
"use_cache": true,
"vocab_size": 128256
}

1
configuration.json Normal file
View File

@@ -0,0 +1 @@
{"framework": "pytorch", "task": "text-generation", "allow_remote": true}

14
generation_config.json Normal file
View File

@@ -0,0 +1,14 @@
{
"bos_token_id": 128000,
"do_sample": true,
"eos_token_id": [
128001,
128008,
128009
],
"max_length": 131072,
"pad_token_id": 128004,
"temperature": 0.6,
"top_p": 0.9,
"transformers_version": "4.52.0.dev0"
}

3
model.safetensors Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1ff795ff6a07e6a68085d206fb84417da2f083f68391c2843cd2b8ac6df8538f
size 2471645608

171
quant.py Normal file
View File

@@ -0,0 +1,171 @@
#!/usr/bin/env python3
"""Convert a local BF16 model into Marlin-supported quant formats via llm-compressor."""
from __future__ import annotations
import gc
import os
import sys
from typing import Optional
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
# Allow running against the local llm-compressor checkout without installing.
LLM_COMPRESSOR_SRC = "/home/quixi/marlin-cdna/llm-compressor/src"
if os.path.isdir(LLM_COMPRESSOR_SRC):
sys.path.insert(0, LLM_COMPRESSOR_SRC)
from llmcompressor import oneshot # noqa: E402
from llmcompressor.modifiers.awq import AWQModifier # noqa: E402
from llmcompressor.modifiers.quantization import ( # noqa: E402
GPTQModifier,
QuantizationModifier,
)
# Local checkpoint directory that every recipe loads as its starting point.
MODEL_PATH = "/home/quixi/models/Llama-3.2-1B"
# Parent directory for outputs; each recipe is saved to
# "<OUTPUT_ROOT>/<basename of MODEL_PATH>-<recipe name>".
OUTPUT_ROOT = "/home/quixi/models"
# Dataset id and split passed to datasets.load_dataset for calibration data.
CALIB_DATASET_ID = "HuggingFaceH4/ultrachat_200k"
CALIB_DATASET_SPLIT = "train_sft"
# Number of rows sliced from the start of the split; also forwarded to oneshot().
NUM_CALIBRATION_SAMPLES = 128
# Truncation length used when tokenizing calibration text; also forwarded to oneshot().
MAX_SEQUENCE_LENGTH = 512
def _load_tokenized_dataset(tokenizer):
    """Build the tokenized calibration dataset.

    Loads the first ``NUM_CALIBRATION_SAMPLES`` rows of the calibration split,
    shuffles them with a fixed seed, renders each row's ``messages`` through
    the tokenizer's chat template, and tokenizes the rendered text with
    truncation at ``MAX_SEQUENCE_LENGTH``.

    Args:
        tokenizer: Object providing ``apply_chat_template`` and a tokenizing
            ``__call__``.

    Returns:
        The mapped dataset with every pre-tokenization column (including the
        intermediate ``text`` column) removed.
    """
    split_spec = f"{CALIB_DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"
    raw_rows = load_dataset(CALIB_DATASET_ID, split=split_spec)
    # The slice is taken first, so the shuffle only reorders those rows.
    shuffled_rows = raw_rows.shuffle(seed=42)

    def render_chat(row):
        rendered = tokenizer.apply_chat_template(row["messages"], tokenize=False)
        return {"text": rendered}

    def encode(row):
        # Special tokens are not added here; presumably the chat template
        # already wrote them into the rendered text.
        return tokenizer(
            row["text"],
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            padding=False,
            add_special_tokens=False,
        )

    with_text = shuffled_rows.map(render_chat)
    return with_text.map(encode, remove_columns=with_text.column_names)
def _load_model_and_tokenizer():
    """Load a fresh model and tokenizer from ``MODEL_PATH``.

    The model is loaded with ``dtype="auto"`` and moved to CUDA when a CUDA
    device is available; otherwise it stays where ``from_pretrained`` put it.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    if not torch.cuda.is_available():
        return model, tokenizer
    model.to("cuda")
    return model, tokenizer
def _cleanup(model, tokenizer):
del model
del tokenizer
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
def _run_recipe(
    name: str,
    recipe,
    *,
    save_compressed: bool,
    use_calibration: bool,
) -> Optional[str]:
    """Quantize a fresh copy of the model with one recipe and save the result.

    Args:
        name: Recipe label; printed in the progress banner and appended to
            the model's base name to form the output directory name.
        recipe: Modifier (or recipe object) forwarded to ``oneshot``.
        save_compressed: When true, ``save_compressed=True`` is passed to
            ``model.save_pretrained``.
        use_calibration: When true, the tokenized calibration dataset and its
            size/length settings are forwarded to ``oneshot``.

    Returns:
        The directory the model and tokenizer were saved to.
    """
    print(f"\n=== Quantizing {name} ===")
    model, tokenizer = _load_model_and_tokenizer()

    oneshot_kwargs = {"model": model, "recipe": recipe}
    if use_calibration:
        oneshot_kwargs.update(
            dataset=_load_tokenized_dataset(tokenizer),
            max_seq_length=MAX_SEQUENCE_LENGTH,
            num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        )
    oneshot(**oneshot_kwargs)

    base_name = os.path.basename(MODEL_PATH.rstrip("/"))
    save_dir = os.path.join(OUTPUT_ROOT, f"{base_name}-{name}")
    os.makedirs(save_dir, exist_ok=True)
    if save_compressed:
        model.save_pretrained(save_dir, save_compressed=True)
    else:
        # NOTE(review): this branch omits the argument instead of passing
        # save_compressed=False, so the library default applies - confirm
        # that is the intended behaviour.
        model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    # _cleanup cannot free objects this frame still references, and
    # oneshot_kwargs holds the model (and dataset) too. Drop every local
    # reference first so the collection pass and the CUDA cache flush can
    # actually reclaim the memory before the next recipe loads its model.
    del oneshot_kwargs, model, tokenizer
    _cleanup(None, None)
    return save_dir
def main():
    """Run every quantization recipe in order, one output directory each."""
    # Each row: (output suffix, modifier factory, save_compressed, use_calibration).
    # Factories are lambdas so each modifier is only constructed right before
    # its own run, as in a hand-written sequence of calls.
    plan = (
        # GPTQ W4A16 (INT4 weight-only).
        (
            "W4A16-GPTQ",
            lambda: GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
            True,
            True,
        ),
        # AWQ W4A16 (INT4 weight-only).
        (
            "W4A16-AWQ",
            lambda: AWQModifier(
                targets=["Linear"],
                scheme="W4A16_ASYM",
                ignore=["lm_head"],
                duo_scaling="both",
            ),
            True,
            True,
        ),
        # GPTQ W8A16 (INT8 weight-only).
        (
            "W8A16-GPTQ",
            lambda: GPTQModifier(targets="Linear", scheme="W8A16", ignore=["lm_head"]),
            True,
            True,
        ),
        # FP8 dynamic (W8A8-FP8).
        (
            "FP8-Dynamic",
            lambda: QuantizationModifier(
                targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
            ),
            False,
            False,
        ),
        # NVFP4A16 (FP4 weights + FP16 activations).
        (
            "NVFP4A16",
            lambda: QuantizationModifier(
                targets="Linear", scheme="NVFP4A16", ignore=["lm_head"]
            ),
            True,
            False,
        ),
        # MXFP4 (FP4 weights).
        (
            "MXFP4",
            lambda: QuantizationModifier(
                targets="Linear", scheme="MXFP4", ignore=["lm_head"]
            ),
            True,
            False,
        ),
    )
    for suffix, build_recipe, compressed, calibrated in plan:
        _run_recipe(
            suffix,
            build_recipe(),
            save_compressed=compressed,
            use_calibration=calibrated,
        )


if __name__ == "__main__":
    main()

23
special_tokens_map.json Normal file
View File

@@ -0,0 +1,23 @@
{
"bos_token": {
"content": "<|begin_of_text|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<|eot_id|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<|finetune_right_pad_id|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

BIN
tokenizer.json (Stored with Git LFS) Normal file

Binary file not shown.

2067
tokenizer_config.json Normal file

File diff suppressed because it is too large Load Diff