初始化项目,由ModelHub XC社区提供模型
Model: abetlen/Phi-3.5-vision-instruct-gguf Source: Original Platform
This commit is contained in:
38
.gitattributes
vendored
Normal file
38
.gitattributes
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
Phi-3.5-3.8B-vision-instruct-F16.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
Phi-3.5-3.8B-vision-instruct-mmproj-F16.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
Phi-3.5-3.8B-vision-instruct-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
|
||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) Microsoft Corporation.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE
|
||||
3
Phi-3.5-3.8B-vision-instruct-F16.gguf
Normal file
3
Phi-3.5-3.8B-vision-instruct-F16.gguf
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f9c42238c6ca2ab31d7fdb4786e8ba5d52bd759cff11e9636b3c81049373be99
|
||||
size 7643297312
|
||||
3
Phi-3.5-3.8B-vision-instruct-Q8_0.gguf
Normal file
3
Phi-3.5-3.8B-vision-instruct-Q8_0.gguf
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ad0a1ee23ea9d88e932b493a4c077dea95c0f52a0f57a604509504c6ebc3df12
|
||||
size 4061222432
|
||||
3
Phi-3.5-3.8B-vision-instruct-mmproj-F16.gguf
Normal file
3
Phi-3.5-3.8B-vision-instruct-mmproj-F16.gguf
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ab8449cc7527c21d7082a6ca8266f67a71b459019f67e814ae1683700e61f3f9
|
||||
size 652183008
|
||||
5
README.md
Normal file
5
README.md
Normal file
@@ -0,0 +1,5 @@
|
||||
---
|
||||
license: mit
|
||||
---
|
||||
|
||||
Work in progress: this conversion does not currently work with the main branch of llama.cpp.
|
||||
297
convert_image_gguf.py
Normal file
297
convert_image_gguf.py
Normal file
@@ -0,0 +1,297 @@
|
||||
import argparse
import json
import os
import typing

import numpy as np
import numpy.typing as npt
import torch
from gguf import *
from safetensors import safe_open
|
||||
|
||||
def k(raw_key: str, arch: str) -> str:
    """Fill the ``{arch}`` placeholder of a metadata key template.

    Args:
        raw_key: Key template containing an ``{arch}`` replacement field.
        arch: Text substituted for ``{arch}``.

    Returns:
        ``raw_key`` with ``{arch}`` replaced by ``arch``.
    """
    return raw_key.format(arch=arch)
|
||||
|
||||
class Args:
    """Plain container for a model path and an output path.

    The attribute names match the ``--model`` and ``--output`` options that
    ``main`` parses.
    """

    def __init__(self, model: str, output: str):
        # Stored as given; no validation or normalisation happens here.
        self.model = model
        self.output = output
|
||||
|
||||
class SafetensorsIndexFile(typing.TypedDict):
    """Typed view of the parsed safetensors index JSON.

    Only the single key that ``SafetensorsIndex`` reads is declared.
    """

    # Tensor name -> name of the file that stores it. SafetensorsIndex joins
    # each file name with the directory of the index file before opening it.
    weight_map: typing.Dict[str, str]
|
||||
|
||||
class SafetensorsIndex:
    """Look up tensors by name across the files listed in a safetensors index.

    The index JSON's ``weight_map`` says which file holds each tensor. Every
    distinct file is opened once at construction time and kept open for the
    lifetime of the object.
    """

    def __init__(self, index_file_path: str):
        """Parse the index JSON and open every file it references.

        Args:
            index_file_path: Path to the index JSON. Files named in its
                ``weight_map`` are resolved relative to its directory.
        """
        directory = os.path.dirname(index_file_path)
        # Context manager so the index handle is closed right after parsing;
        # JSON is UTF-8, so do not depend on the platform's default encoding.
        with open(index_file_path, encoding="utf-8") as index_file:
            self.index = typing.cast(SafetensorsIndexFile, json.load(index_file))
        self.weight_map = self.index["weight_map"]
        # Many tensors share a file; de-duplicate so each file is opened once.
        shard_files = set(self.weight_map.values())
        self.tensors = {
            file: safe_open(os.path.join(directory, file), framework="pt")
            for file in shard_files
        }

    def get_tensor(self, key: str) -> npt.NDArray[np.float32]:
        """Return the tensor stored under ``key`` as a float32 NumPy array.

        Args:
            key: Tensor name as it appears in the index's ``weight_map``.

        Returns:
            The tensor converted to ``torch.float32`` and then to NumPy.

        Raises:
            KeyError: If ``key`` is not present in ``weight_map``.
        """
        shard = self.tensors[self.weight_map[key]]
        # Convert to float32 first, then hand the data over to NumPy.
        as_float32 = shard.get_tensor(key).to(torch.float32)
        return typing.cast(npt.NDArray[np.float32], as_float32.numpy())
|
||||
|
||||
def _add_module_tensors(fout, tensors, dst, src, weight_dtype):
    """Write one module's ``weight`` and ``bias`` tensors to the GGUF writer.

    Args:
        fout: Writer exposing ``add_tensor(name, array)``.
        tensors: Source exposing ``get_tensor(name)`` that returns an array.
        dst: Output name stem; ``.weight`` / ``.bias`` are appended.
        src: Checkpoint name stem; ``.weight`` / ``.bias`` are appended.
        weight_dtype: NumPy dtype for the weight. The bias is always float32.
    """
    fout.add_tensor(f"{dst}.weight", tensors.get_tensor(f"{src}.weight").astype(weight_dtype))
    fout.add_tensor(f"{dst}.bias", tensors.get_tensor(f"{src}.bias").astype(np.float32))


def _add_vision_hparams(fout, cfg):
    """Write the vision-encoder hyperparameters and preprocessing constants."""
    VISION = "clip.vision"
    fout.add_uint32("clip.vision.image_size", cfg["image_size"])
    fout.add_uint32("clip.vision.patch_size", cfg["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), cfg["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), cfg["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", cfg["projection_dim"])
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), cfg["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), cfg["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), cfg["num_hidden_layers"])

    fout.add_array("clip.vision.image_mean", [0.48145466, 0.4578275, 0.40821073])
    fout.add_array("clip.vision.image_std", [0.26862954, 0.26130258, 0.27577711])

    # False for the "quick_gelu" activation configured in main().
    fout.add_bool("clip.use_gelu", cfg["hidden_act"] != "quick_gelu")


def _add_vision_tensors(fout, tensors, cfg):
    """Copy the vision-encoder and projector tensors into the GGUF writer.

    Tensors are added in a fixed order. Linear and embedding weights are
    written as float16; biases, norm parameters, the class embedding and the
    ``sub_GN`` / ``glb_GN`` tensors are written as float32.
    """
    embed = "model.vision_embed_tokens."
    prefix = embed + "img_processor.vision_model."

    # Embeddings.
    fout.add_tensor(
        "v.class_embd",
        tensors.get_tensor(f"{prefix}embeddings.class_embedding").astype(np.float32),
    )
    patch = cfg["patch_size"]
    fout.add_tensor(
        "v.patch_embd.weight",
        tensors.get_tensor(f"{prefix}embeddings.patch_embedding.weight")
        .reshape(cfg["hidden_size"], cfg["num_channels"], patch, patch)
        .astype(np.float16),
    )
    fout.add_tensor(
        "v.position_embd.weight",
        tensors.get_tensor(f"{prefix}embeddings.position_embedding.weight").astype(np.float16),
    )

    # Extra tensors stored directly under vision_embed_tokens.
    for dst, src in (("v.sub_gn", "sub_GN"), ("v.glb_gn", "glb_GN")):
        fout.add_tensor(dst, tensors.get_tensor(f"{embed}{src}").astype(np.float32))

    # Projector: entries 0 and 2 of img_projection.
    for idx in (0, 2):
        _add_module_tensors(fout, tensors, f"mm.{idx}", f"{embed}img_projection.{idx}", np.float16)

    # (output stem, checkpoint stem, weight dtype), in the order written per layer.
    # NOTE(review): layer_norm1/layer_norm2 are deliberately written twice
    # (attn_norm/ffn_norm and ln1/ln2), and fc1 -> ffn_down / fc2 -> ffn_up,
    # exactly as the script has always done - confirm against the loader.
    block_modules = (
        ("attn_norm", "layer_norm1", np.float32),
        ("ffn_norm", "layer_norm2", np.float32),
        ("ffn_down", "mlp.fc1", np.float16),
        ("ffn_up", "mlp.fc2", np.float16),
        ("attn_k", "self_attn.k_proj", np.float16),
        ("attn_out", "self_attn.out_proj", np.float16),
        ("attn_q", "self_attn.q_proj", np.float16),
        ("attn_v", "self_attn.v_proj", np.float16),
        ("ln1", "layer_norm1", np.float32),
        ("ln2", "layer_norm2", np.float32),
    )
    for i in range(cfg["num_hidden_layers"]):
        for dst, src, weight_dtype in block_modules:
            _add_module_tensors(
                fout,
                tensors,
                f"v.blk.{i}.{dst}",
                f"{prefix}encoder.layers.{i}.{src}",
                weight_dtype,
            )

    _add_module_tensors(fout, tensors, "v.post_ln", f"{prefix}post_layernorm", np.float32)
    # "pre_layrnorm" (sic) is kept verbatim; presumably it matches the
    # checkpoint's own key spelling - verify before "fixing" it.
    _add_module_tensors(fout, tensors, "v.pre_ln", f"{prefix}pre_layrnorm", np.float32)


def main():
    """Extract the vision encoder and projector of a checkpoint into a GGUF file.

    Command-line options:
        --model: Directory containing ``model.safetensors.index.json`` and the
            safetensors files it references.
        --output: Path of the GGUF file to write.
    """
    parser = argparse.ArgumentParser(description="Extract vision model from safetensors to GGUF")
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Input model directory containing model.safetensors.index.json",
    )
    parser.add_argument("--output", type=str, required=True, help="Output GGUF file")
    args = parser.parse_args()

    import pathlib

    dir_model = pathlib.Path(args.model)
    tensors = SafetensorsIndex((dir_model / "model.safetensors.index.json").as_posix())

    ftype = 1  # fp16

    # Hard-coded vision configuration, taken from the
    # CLIP_VIT_LARGE_PATCH14_336_CONFIG block this script was written against.
    clip_vision_config = {
        "hidden_size": 1024,
        "intermediate_size": 4096,
        "projection_dim": 768,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "num_channels": 3,
        "image_size": 336,
        "patch_size": 14,
        "hidden_act": "quick_gelu",
        "layer_norm_eps": 1e-5,
        "attention_dropout": 0.0,
        "initializer_range": 0.02,
        "initializer_factor": 1.0,
        "dropout": 0.0,
    }

    fout = GGUFWriter(args.output, arch="clip")
    try:
        fout.add_bool("clip.has_text_encoder", False)
        fout.add_bool("clip.has_vision_encoder", True)
        fout.add_bool("clip.has_llava_projector", True)
        fout.add_file_type(ftype)

        model_name = "microsoft/phi-3.5-vision-instruct"
        fout.add_name(model_name)
        fout.add_description("image encoder for " + model_name)
        fout.add_string("clip.projector_type", "mlp")

        _add_vision_hparams(fout, clip_vision_config)
        _add_vision_tensors(fout, tensors, clip_vision_config)

        fout.write_header_to_file()
        fout.write_kv_data_to_file()
        fout.write_tensors_to_file()
    finally:
        # Release the writer even when a tensor lookup or a write fails.
        fout.close()
|
||||
|
||||
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user