初始化项目,由ModelHub XC社区提供模型

Model: ciCic/llama-3.2-1B-Instruct-AWQ
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-08 20:40:53 +08:00
commit 18fa0096b0
8 changed files with 2325 additions and 0 deletions

36
.gitattributes vendored Normal file
View File

@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text

145
README.md Normal file
View File

@@ -0,0 +1,145 @@
---
language:
- en
- de
- fr
- it
- pt
- hi
- es
- th
library_name: transformers
pipeline_tag: text-generation
tags:
- facebook
- meta
- pytorch
- llama-3
license: llama3.2
base_model:
- meta-llama/Llama-3.2-1B-Instruct
---
# Represents
A quantized version of Llama 3.2 1B Instruct with Activation-aware Weight Quantization (AWQ)[https://github.com/mit-han-lab/llm-awq]
## Use with transformers/autoawq
Starting with
- `transformers==4.45.1`
- `accelerate==0.34.2`
- `torch==2.3.1`
- `numpy==2.0.0`
- `autoawq==0.2.6`
Experimented with
- OS = Windows
- GPU = Nvidia GeForce RTX 3080 10gb
- CPU = Intel Core i5-9600K
- RAM = 32GB
### For CUDA users
**AutoAWQ**
NOTE: this example uses `fuse_layers=True` to fuse attention and mlp layers together for faster inference
```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer
quant_id = "ciCic/llama-3.2-1B-Instruct-AWQ"
model = AutoAWQForCausalLM.from_quantized(quant_id, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_id, trust_remote_code=True)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Declare prompt
prompt = "You're standing on the surface of the Earth. "\
"You walk one mile south, one mile west and one mile north. "\
"You end up exactly where you started. Where are you?"
# Tokenization of the prompt
tokens = tokenizer(
prompt,
return_tensors='pt'
).input_ids.cuda()
# Generate output in a streaming fashion
generation_output = model.generate(
tokens,
streamer=streamer,
max_new_tokens=512
)
```
**Transformers**
```python
from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM
import torch
quant_id = "ciCic/llama-3.2-1B-Instruct-AWQ"
tokenizer = AutoTokenizer.from_pretrained(quant_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
quant_id,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
device_map="cuda"
)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Convert prompt to tokens
prompt = "You're standing on the surface of the Earth. "\
"You walk one mile south, one mile west and one mile north. "\
"You end up exactly where you started. Where are you?"
tokens = tokenizer(
prompt,
return_tensors='pt'
).input_ids.cuda()
# Generate output
generation_output = model.generate(
tokens,
streamer=streamer,
max_new_tokens=512
)
```
#### Issue/Solution
- torch.from_numpy fails
- This might be due to certain issues within `torch==2.3.1` .cpp files. Since AutoAWQ uses torch version 2.3.1, instead of most recent, this issue might occur within module `marlin.py -> def _get_perms()`
- Module path: Python\Python311\site-packages\awq\modules\linear\marlin.py
- Solution:
- there are several operations to numpy (cpu) then back to tensor (gpu) which could be completely replaced by tensor without having to use numpy, this will solve (temporarily) the from_numpy() issue
```python
def _get_perms():
perm = []
for i in range(32):
perm1 = []
col = i // 4
for block in [0, 1]:
for row in [
2 * (i % 4),
2 * (i % 4) + 1,
2 * (i % 4 + 4),
2 * (i % 4 + 4) + 1,
]:
perm1.append(16 * row + col + 8 * block)
for j in range(4):
perm.extend([p + 256 * j for p in perm1])
# perm = np.array(perm)
perm = torch.asarray(perm)
# interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7])
interleave = torch.asarray([0, 2, 4, 6, 1, 3, 5, 7])
perm = perm.reshape((-1, 8))[:, interleave].ravel()
# perm = torch.from_numpy(perm)
scale_perm = []
for i in range(8):
scale_perm.extend([i + 8 * j for j in range(8)])
scale_perm_single = []
for i in range(4):
scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
return perm, scale_perm, scale_perm_single
```

48
config.json Normal file
View File

@@ -0,0 +1,48 @@
{
"_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": [
128001,
128008,
128009
],
"head_dim": 64,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 8192,
"max_position_embeddings": 131072,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 16,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"quantization_config": {
"bits": 4,
"group_size": 128,
"modules_to_not_convert": null,
"quant_method": "awq",
"version": "gemm",
"zero_point": true
},
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 32.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"rope_theta": 500000.0,
"tie_word_embeddings": true,
"torch_dtype": "float16",
"transformers_version": "4.45.1",
"use_cache": false,
"vocab_size": 128256
}

12
generation_config.json Normal file
View File

@@ -0,0 +1,12 @@
{
"bos_token_id": 128000,
"do_sample": true,
"eos_token_id": [
128001,
128008,
128009
],
"temperature": 0.6,
"top_p": 0.9,
"transformers_version": "4.45.1"
}

3
model.safetensors Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:754a7a3c9e43ed5a55175d379322f8462d82cb2d5c29676e2caf1c69d58931c6
size 1556394472

16
special_tokens_map.json Normal file
View File

@@ -0,0 +1,16 @@
{
"bos_token": {
"content": "<|begin_of_text|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<|eot_id|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

BIN
tokenizer.json (Stored with Git LFS) Normal file

Binary file not shown.

2062
tokenizer_config.json Normal file

File diff suppressed because it is too large Load Diff