---
license: mit
---
Llama 2 7B quantized to 2-bit with GPTQ.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.gptq import GPTQQuantizer
import torch

# GPTQ bit width for the quantized weights.
w = 2
# Fix: the model id must be a string literal (was unquoted, a syntax error).
model_path = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
# Load in fp16 to halve memory before quantization.
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)

# Calibrate on the "c4" dataset; model_seqlen matches Llama 2's 4096 context.
quantizer = GPTQQuantizer(bits=w, dataset="c4", model_seqlen=4096)
quantized_model = quantizer.quantize_model(model, tokenizer)
```