From 0a89956c717fea8ae410205b8869813b49b3f404 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sat, 20 Jun 2026 03:16:17 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: abetlen/Phi-3.5-vision-instruct-gguf Source: Original Platform --- .gitattributes | 38 +++ LICENSE | 21 ++ Phi-3.5-3.8B-vision-instruct-F16.gguf | 3 + Phi-3.5-3.8B-vision-instruct-Q8_0.gguf | 3 + Phi-3.5-3.8B-vision-instruct-mmproj-F16.gguf | 3 + README.md | 5 + convert_image_gguf.py | 297 +++++++++++++++++++ 7 files changed, 370 insertions(+) create mode 100644 .gitattributes create mode 100644 LICENSE create mode 100644 Phi-3.5-3.8B-vision-instruct-F16.gguf create mode 100644 Phi-3.5-3.8B-vision-instruct-Q8_0.gguf create mode 100644 Phi-3.5-3.8B-vision-instruct-mmproj-F16.gguf create mode 100644 README.md create mode 100644 convert_image_gguf.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..2e3c008 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text 
+*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +Phi-3.5-3.8B-vision-instruct-F16.gguf filter=lfs diff=lfs merge=lfs -text +Phi-3.5-3.8B-vision-instruct-mmproj-F16.gguf filter=lfs diff=lfs merge=lfs -text +Phi-3.5-3.8B-vision-instruct-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9e841e7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
"""Extract the Phi-3.5-vision image encoder from safetensors into a GGUF file.

The script reads a sharded safetensors checkpoint (located through its
``model.safetensors.index.json``), copies the CLIP vision tower and the image
projection tensors into a GGUF file with ``arch="clip"``, and writes the
hyper-parameters of the CLIP ViT-L/14-336 vision configuration as metadata.

Usage::

    python convert_image_gguf.py --model <checkpoint dir> --output <file.gguf>
"""
import argparse
import json
import os
import pathlib
import typing

import numpy as np
import numpy.typing as npt
import torch

MODEL_NAME = "microsoft/phi-3.5-vision-instruct"

# GGUF file-type id written to the header (1 == fp16).
FTYPE_F16 = 1

# Effective vision hyper-parameters (the CLIP_VIT_LARGE_PATCH14_336_CONFIG
# values). The original script started from the generic CLIPVisionConfig
# defaults and then overwrote every one of them with these values.
CLIP_VISION_CONFIG: typing.Dict[str, typing.Any] = {
    "attention_dropout": 0.0,
    "dropout": 0.0,
    "hidden_act": "quick_gelu",
    "hidden_size": 1024,
    "image_size": 336,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "layer_norm_eps": 1e-05,
    "num_attention_heads": 16,
    "num_channels": 3,
    "num_hidden_layers": 24,
    "patch_size": 14,
    "projection_dim": 768,
}

# Per-channel normalisation constants stored in the GGUF metadata.
IMAGE_MEAN = [0.48145466, 0.4578275, 0.40821073]
IMAGE_STD = [0.26862954, 0.26130258, 0.27577711]

# Checkpoint key prefixes.
_EMBED_PREFIX = "model.vision_embed_tokens."
_VISION_PREFIX = _EMBED_PREFIX + "img_processor.vision_model."

# (GGUF name inside "v.blk.<i>.", checkpoint sub-module, dtype of the weight).
# Biases are always stored as float32. The order is the write order.
# NOTE(review): fc1 -> ffn_down and fc2 -> ffn_up is kept exactly as the
# original script had it; presumably it matches what the loader expects —
# confirm before "fixing" the apparent swap. The layer norms are
# intentionally emitted twice (attn_norm/ffn_norm and ln1/ln2), as before.
_LAYER_TENSORS = (
    ("attn_norm", "layer_norm1", np.float32),
    ("ffn_norm", "layer_norm2", np.float32),
    ("ffn_down", "mlp.fc1", np.float16),
    ("ffn_up", "mlp.fc2", np.float16),
    ("attn_k", "self_attn.k_proj", np.float16),
    ("attn_out", "self_attn.out_proj", np.float16),
    ("attn_q", "self_attn.q_proj", np.float16),
    ("attn_v", "self_attn.v_proj", np.float16),
    ("ln1", "layer_norm1", np.float32),
    ("ln2", "layer_norm2", np.float32),
)


def k(raw_key: str, arch: str) -> str:
    """Fill the ``{arch}`` placeholder of a GGUF key template."""
    return raw_key.format(arch=arch)


class Args:
    """Plain container mirroring the command-line options.

    Not used by :func:`main`; kept so that existing callers that construct
    it keep working.
    """

    def __init__(self, model, output):
        self.model = model
        self.output = output


class SafetensorsIndexFile(typing.TypedDict):
    """Shape of ``model.safetensors.index.json`` as far as this script reads it."""

    weight_map: typing.Dict[str, str]


def _open_shard(path: str):
    """Open one safetensors shard for reading with the PyTorch framework.

    ``safetensors`` is imported here rather than at module level so that the
    module can be imported without it.
    """
    from safetensors import safe_open

    return safe_open(path, framework="pt")


class SafetensorsIndex:
    """Look tensors up by name across the shards of a safetensors checkpoint."""

    def __init__(self, index_file_path: str):
        """Load the index file and open every shard it references.

        Args:
            index_file_path: Path to ``model.safetensors.index.json``; shard
                file names in its ``weight_map`` are resolved relative to the
                directory containing it.
        """
        directory = os.path.dirname(index_file_path)
        with open(index_file_path, encoding="utf-8") as handle:
            self.index = typing.cast(SafetensorsIndexFile, json.load(handle))
        self.weight_map = self.index["weight_map"]
        shard_names = set(self.weight_map.values())
        self.tensors = {
            name: _open_shard(os.path.join(directory, name)) for name in shard_names
        }

    def get_tensor(self, key: str) -> npt.NDArray[np.float32]:
        """Return tensor ``key`` as a float32 NumPy array.

        Raises:
            KeyError: If ``key`` is not listed in the weight map.
        """
        shard = self.tensors[self.weight_map[key]]
        # Convert to float32 on the torch side before handing over to NumPy.
        as_f32 = shard.get_tensor(key).to(torch.float32).numpy()
        return typing.cast(npt.NDArray[np.float32], as_f32)


def _add_pair(fout, tensors, dst: str, src: str, weight_dtype) -> None:
    """Add ``<dst>.weight`` (as ``weight_dtype``) and ``<dst>.bias`` (float32)."""
    fout.add_tensor(f"{dst}.weight", tensors.get_tensor(f"{src}.weight").astype(weight_dtype))
    fout.add_tensor(f"{dst}.bias", tensors.get_tensor(f"{src}.bias").astype(np.float32))


def _add_metadata(fout, cfg: typing.Mapping[str, typing.Any]) -> None:
    """Write the CLIP key/value metadata for the vision encoder."""
    from gguf import (
        KEY_ATTENTION_HEAD_COUNT,
        KEY_ATTENTION_LAYERNORM_EPS,
        KEY_BLOCK_COUNT,
        KEY_EMBEDDING_LENGTH,
        KEY_FEED_FORWARD_LENGTH,
    )

    fout.add_bool("clip.has_text_encoder", False)
    fout.add_bool("clip.has_vision_encoder", True)
    fout.add_bool("clip.has_llava_projector", True)
    fout.add_file_type(FTYPE_F16)

    fout.add_name(MODEL_NAME)
    fout.add_description("image encoder for " + MODEL_NAME)
    fout.add_string("clip.projector_type", "mlp")

    # Vision model hparams
    vision = "clip.vision"
    fout.add_uint32("clip.vision.image_size", cfg["image_size"])
    fout.add_uint32("clip.vision.patch_size", cfg["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, vision), cfg["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, vision), cfg["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", cfg["projection_dim"])
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, vision), cfg["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, vision), cfg["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, vision), cfg["num_hidden_layers"])

    fout.add_array("clip.vision.image_mean", IMAGE_MEAN)
    fout.add_array("clip.vision.image_std", IMAGE_STD)

    # The flag is false for "quick_gelu" and true for any other activation.
    fout.add_bool("clip.use_gelu", cfg["hidden_act"] != "quick_gelu")


def _add_tensors(fout, tensors, cfg: typing.Mapping[str, typing.Any]) -> None:
    """Queue every vision/projection tensor on the writer, in a fixed order."""
    hidden, patch = cfg["hidden_size"], cfg["patch_size"]

    # Embeddings
    fout.add_tensor(
        "v.class_embd",
        tensors.get_tensor(f"{_VISION_PREFIX}embeddings.class_embedding").astype(np.float32),
    )
    fout.add_tensor(
        "v.patch_embd.weight",
        tensors.get_tensor(f"{_VISION_PREFIX}embeddings.patch_embedding.weight")
        .reshape(hidden, cfg["num_channels"], patch, patch)
        .astype(np.float16),
    )
    fout.add_tensor(
        "v.position_embd.weight",
        tensors.get_tensor(f"{_VISION_PREFIX}embeddings.position_embedding.weight").astype(np.float16),
    )

    # Extra tensors stored next to the image processor in the checkpoint.
    fout.add_tensor("v.sub_gn", tensors.get_tensor(f"{_EMBED_PREFIX}sub_GN").astype(np.float32))
    fout.add_tensor("v.glb_gn", tensors.get_tensor(f"{_EMBED_PREFIX}glb_GN").astype(np.float32))

    # Image projection: entries 0 and 2 of the img_projection module.
    for idx in (0, 2):
        _add_pair(fout, tensors, f"mm.{idx}", f"{_EMBED_PREFIX}img_projection.{idx}", np.float16)

    # Transformer blocks
    for layer in range(cfg["num_hidden_layers"]):
        for dst, src, weight_dtype in _LAYER_TENSORS:
            _add_pair(
                fout,
                tensors,
                f"v.blk.{layer}.{dst}",
                f"{_VISION_PREFIX}encoder.layers.{layer}.{src}",
                weight_dtype,
            )

    _add_pair(fout, tensors, "v.post_ln", f"{_VISION_PREFIX}post_layernorm", np.float32)
    # "pre_layrnorm" (sic) is the key name the checkpoint is read with.
    _add_pair(fout, tensors, "v.pre_ln", f"{_VISION_PREFIX}pre_layrnorm", np.float32)


def main(argv: typing.Optional[typing.Sequence[str]] = None) -> None:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` (the default) uses
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Extract vision model from safetensors to GGUF")
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Input model directory containing model.safetensors.index.json",
    )
    parser.add_argument("--output", type=str, required=True, help="Output GGUF file")
    args = parser.parse_args(argv)

    # Imported after argument parsing so that --help works without gguf.
    from gguf import GGUFWriter

    dir_model = pathlib.Path(args.model)
    tensors = SafetensorsIndex((dir_model / "model.safetensors.index.json").as_posix())

    fout = GGUFWriter(args.output, arch="clip")
    try:
        _add_metadata(fout, CLIP_VISION_CONFIG)
        _add_tensors(fout, tensors, CLIP_VISION_CONFIG)

        fout.write_header_to_file()
        fout.write_kv_data_to_file()
        fout.write_tensors_to_file()
    finally:
        # Release the writer even when a tensor is missing or a write fails.
        fout.close()


if __name__ == "__main__":
    main()