初始化项目,由ModelHub XC社区提供模型
Model: ciCic/llama-3.2-1B-Instruct-AWQ Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
145
README.md
Normal file
145
README.md
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
---
|
||||||
|
language:
|
||||||
|
- en
|
||||||
|
- de
|
||||||
|
- fr
|
||||||
|
- it
|
||||||
|
- pt
|
||||||
|
- hi
|
||||||
|
- es
|
||||||
|
- th
|
||||||
|
library_name: transformers
|
||||||
|
pipeline_tag: text-generation
|
||||||
|
tags:
|
||||||
|
- facebook
|
||||||
|
- meta
|
||||||
|
- pytorch
|
||||||
|
- llama-3
|
||||||
|
license: llama3.2
|
||||||
|
base_model:
|
||||||
|
- meta-llama/Llama-3.2-1B-Instruct
|
||||||
|
---
|
||||||
|
# Represents
|
||||||
|
A quantized version of Llama 3.2 1B Instruct with [Activation-aware Weight Quantization (AWQ)](https://github.com/mit-han-lab/llm-awq)
|
||||||
|
|
||||||
|
## Use with transformers/autoawq
|
||||||
|
Starting with
|
||||||
|
- `transformers==4.45.1`
|
||||||
|
- `accelerate==0.34.2`
|
||||||
|
- `torch==2.3.1`
|
||||||
|
- `numpy==2.0.0`
|
||||||
|
- `autoawq==0.2.6`
|
||||||
|
|
||||||
|
Experimented with
|
||||||
|
- OS = Windows
|
||||||
|
- GPU = Nvidia GeForce RTX 3080 10GB
|
||||||
|
- CPU = Intel Core i5-9600K
|
||||||
|
- RAM = 32GB
|
||||||
|
|
||||||
|
### For CUDA users
|
||||||
|
|
||||||
|
**AutoAWQ**
|
||||||
|
|
||||||
|
NOTE: This example uses `fuse_layers=True` to fuse the attention and MLP layers together for faster inference.
|
||||||
|
```python
|
||||||
|
from awq import AutoAWQForCausalLM
|
||||||
|
from transformers import AutoTokenizer, TextStreamer
|
||||||
|
|
||||||
|
quant_id = "ciCic/llama-3.2-1B-Instruct-AWQ"
|
||||||
|
model = AutoAWQForCausalLM.from_quantized(quant_id, fuse_layers=True)
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(quant_id, trust_remote_code=True)
|
||||||
|
|
||||||
|
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
||||||
|
|
||||||
|
# Declare prompt
|
||||||
|
prompt = "You're standing on the surface of the Earth. "\
|
||||||
|
"You walk one mile south, one mile west and one mile north. "\
|
||||||
|
"You end up exactly where you started. Where are you?"
|
||||||
|
|
||||||
|
# Tokenization of the prompt
|
||||||
|
tokens = tokenizer(
|
||||||
|
prompt,
|
||||||
|
return_tensors='pt'
|
||||||
|
).input_ids.cuda()
|
||||||
|
|
||||||
|
# Generate output in a streaming fashion
|
||||||
|
generation_output = model.generate(
|
||||||
|
tokens,
|
||||||
|
streamer=streamer,
|
||||||
|
max_new_tokens=512
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Transformers**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM
|
||||||
|
import torch
|
||||||
|
|
||||||
|
quant_id = "ciCic/llama-3.2-1B-Instruct-AWQ"
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(quant_id, trust_remote_code=True)
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
quant_id,
|
||||||
|
torch_dtype=torch.float16,
|
||||||
|
low_cpu_mem_usage=True,
|
||||||
|
device_map="cuda"
|
||||||
|
)
|
||||||
|
|
||||||
|
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
||||||
|
|
||||||
|
# Convert prompt to tokens
|
||||||
|
prompt = "You're standing on the surface of the Earth. "\
|
||||||
|
"You walk one mile south, one mile west and one mile north. "\
|
||||||
|
"You end up exactly where you started. Where are you?"
|
||||||
|
|
||||||
|
tokens = tokenizer(
|
||||||
|
prompt,
|
||||||
|
return_tensors='pt'
|
||||||
|
).input_ids.cuda()
|
||||||
|
|
||||||
|
# Generate output
|
||||||
|
generation_output = model.generate(
|
||||||
|
tokens,
|
||||||
|
streamer=streamer,
|
||||||
|
max_new_tokens=512
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Issue/Solution
|
||||||
|
- torch.from_numpy fails
|
||||||
|
- This might be due to an issue in the `.cpp` files of `torch==2.3.1`. Because AutoAWQ uses torch version 2.3.1 instead of the most recent release, the error may occur in the function `_get_perms()` of the module `marlin.py`.
|
||||||
|
- Module path: Python\Python311\site-packages\awq\modules\linear\marlin.py
|
||||||
|
- Solution:
|
||||||
|
- Several operations convert data to NumPy (CPU) and then back to a tensor (GPU). These can be replaced entirely with tensor operations, without using NumPy, which (temporarily) works around the `from_numpy()` issue:
|
||||||
|
```python
|
||||||
|
def _get_perms():
|
||||||
|
perm = []
|
||||||
|
for i in range(32):
|
||||||
|
perm1 = []
|
||||||
|
col = i // 4
|
||||||
|
for block in [0, 1]:
|
||||||
|
for row in [
|
||||||
|
2 * (i % 4),
|
||||||
|
2 * (i % 4) + 1,
|
||||||
|
2 * (i % 4 + 4),
|
||||||
|
2 * (i % 4 + 4) + 1,
|
||||||
|
]:
|
||||||
|
perm1.append(16 * row + col + 8 * block)
|
||||||
|
|
||||||
|
for j in range(4):
|
||||||
|
perm.extend([p + 256 * j for p in perm1])
|
||||||
|
|
||||||
|
# perm = np.array(perm)
|
||||||
|
perm = torch.asarray(perm)
|
||||||
|
# interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7])
|
||||||
|
interleave = torch.asarray([0, 2, 4, 6, 1, 3, 5, 7])
|
||||||
|
perm = perm.reshape((-1, 8))[:, interleave].ravel()
|
||||||
|
# perm = torch.from_numpy(perm)
|
||||||
|
scale_perm = []
|
||||||
|
for i in range(8):
|
||||||
|
scale_perm.extend([i + 8 * j for j in range(8)])
|
||||||
|
scale_perm_single = []
|
||||||
|
for i in range(4):
|
||||||
|
scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
|
||||||
|
return perm, scale_perm, scale_perm_single
|
||||||
|
```
|
||||||
48
config.json
Normal file
48
config.json
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
{
|
||||||
|
"_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
|
||||||
|
"architectures": [
|
||||||
|
"LlamaForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 128000,
|
||||||
|
"eos_token_id": [
|
||||||
|
128001,
|
||||||
|
128008,
|
||||||
|
128009
|
||||||
|
],
|
||||||
|
"head_dim": 64,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 2048,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 8192,
|
||||||
|
"max_position_embeddings": 131072,
|
||||||
|
"mlp_bias": false,
|
||||||
|
"model_type": "llama",
|
||||||
|
"num_attention_heads": 32,
|
||||||
|
"num_hidden_layers": 16,
|
||||||
|
"num_key_value_heads": 8,
|
||||||
|
"pretraining_tp": 1,
|
||||||
|
"quantization_config": {
|
||||||
|
"bits": 4,
|
||||||
|
"group_size": 128,
|
||||||
|
"modules_to_not_convert": null,
|
||||||
|
"quant_method": "awq",
|
||||||
|
"version": "gemm",
|
||||||
|
"zero_point": true
|
||||||
|
},
|
||||||
|
"rms_norm_eps": 1e-05,
|
||||||
|
"rope_scaling": {
|
||||||
|
"factor": 32.0,
|
||||||
|
"high_freq_factor": 4.0,
|
||||||
|
"low_freq_factor": 1.0,
|
||||||
|
"original_max_position_embeddings": 8192,
|
||||||
|
"rope_type": "llama3"
|
||||||
|
},
|
||||||
|
"rope_theta": 500000.0,
|
||||||
|
"tie_word_embeddings": true,
|
||||||
|
"torch_dtype": "float16",
|
||||||
|
"transformers_version": "4.45.1",
|
||||||
|
"use_cache": false,
|
||||||
|
"vocab_size": 128256
|
||||||
|
}
|
||||||
12
generation_config.json
Normal file
12
generation_config.json
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
{
|
||||||
|
"bos_token_id": 128000,
|
||||||
|
"do_sample": true,
|
||||||
|
"eos_token_id": [
|
||||||
|
128001,
|
||||||
|
128008,
|
||||||
|
128009
|
||||||
|
],
|
||||||
|
"temperature": 0.6,
|
||||||
|
"top_p": 0.9,
|
||||||
|
"transformers_version": "4.45.1"
|
||||||
|
}
|
||||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:754a7a3c9e43ed5a55175d379322f8462d82cb2d5c29676e2caf1c69d58931c6
|
||||||
|
size 1556394472
|
||||||
16
special_tokens_map.json
Normal file
16
special_tokens_map.json
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<|begin_of_text|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|eot_id|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
BIN
tokenizer.json
(Stored with Git LFS)
Normal file
BIN
tokenizer.json
(Stored with Git LFS)
Normal file
Binary file not shown.
2062
tokenizer_config.json
Normal file
2062
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user