初始化项目,由ModelHub XC社区提供模型
Model: North-ML1/willow-alpha Source: Original Platform
This commit is contained in:
21
vision_adapter/README.md
Normal file
21
vision_adapter/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
---
|
||||
license: mit
|
||||
---
|
||||
|
||||
# Forge-1V Micro Vision Adapter Scaffold
|
||||
|
||||
This folder contains an experimental, untrained micro vision-adapter scaffold for future Forge-1V work.
|
||||
|
||||
It is intentionally separate from the main text checkpoint:
|
||||
|
||||
- The main `config.json` remains Llama-compatible.
|
||||
- The GGUF export remains text-only.
|
||||
- These files do not make the released model able to view images.
|
||||
|
||||
Suggested target design:
|
||||
|
||||
- Tiny patch encoder: 3-channel images to a small vision width.
|
||||
- Projection: vision width to the 1024-dimensional Forge text hidden size.
|
||||
- Prefix tokens: projected visual tokens can be prepended to the text sequence in a future custom multimodal training run.
|
||||
|
||||
Approximate extra parameters for the scaffold design are well under 1M (about 0.23M with the default configuration), keeping the total system under 400M parameters.
|
||||
8
vision_adapter/config.json
Normal file
8
vision_adapter/config.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"image_size": 224,
|
||||
"patch_size": 16,
|
||||
"vision_width": 128,
|
||||
"text_hidden_size": 1024,
|
||||
"num_prefix_tokens": 16,
|
||||
"status": "experimental_untrained_scaffold"
|
||||
}
|
||||
38
vision_adapter/forge_micro_vision.py
Normal file
38
vision_adapter/forge_micro_vision.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
@dataclass
class ForgeMicroVisionConfig:
    """Hyperparameters for the Forge micro vision-adapter scaffold.

    Field order is part of the interface: instances may be built
    positionally, so new fields should only ever be appended.
    """

    # --- image side ---
    # Nominal input resolution; not read by the adapter code in this module.
    image_size: int = 224
    # Used as both kernel size and stride of the patch-embedding convolution.
    patch_size: int = 16
    # Channel width of the patch embedding (also the LayerNorm width and the
    # input width of the projection).
    vision_width: int = 128

    # --- text side ---
    # Output width of the projection layer.
    text_hidden_size: int = 1024
    # Number of tokens the patch sequence is pooled down to.
    num_prefix_tokens: int = 16
|
||||
|
||||
|
||||
class ForgeMicroVisionAdapter(nn.Module):
    """Tiny untrained image-to-prefix adapter scaffold for Forge-1V experiments.

    Pipeline: a strided convolution (kernel size == stride == ``patch_size``)
    embeds non-overlapping image patches, the flattened patch sequence is
    average-pooled down to ``num_prefix_tokens`` tokens, and every token is
    layer-normalized and linearly projected to ``text_hidden_size``.
    """

    def __init__(self, config: "ForgeMicroVisionConfig | None" = None):
        """Create the adapter layers.

        Args:
            config: Adapter hyperparameters. When omitted, a fresh
                ``ForgeMicroVisionConfig()`` is created for this instance.
        """
        super().__init__()
        # Build the default here rather than in the signature: a signature
        # default is evaluated once and would be shared (and mutable) across
        # every adapter constructed without an explicit config.
        if config is None:
            config = ForgeMicroVisionConfig()
        self.config = config
        self.patch_embed = nn.Conv2d(
            3,
            config.vision_width,
            kernel_size=config.patch_size,
            stride=config.patch_size,
            bias=False,
        )
        # Pools along the last axis, which in forward() is the flattened
        # patch sequence.
        self.pool = nn.AdaptiveAvgPool1d(config.num_prefix_tokens)
        self.norm = nn.LayerNorm(config.vision_width)
        self.proj = nn.Linear(config.vision_width, config.text_hidden_size, bias=False)

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        """Map images to a sequence of prefix-token embeddings.

        Args:
            images: Image batch with 3 channels; expected layout is
                ``(batch, 3, height, width)``.

        Returns:
            For a 4-D input, a tensor of shape
            ``(batch, num_prefix_tokens, text_hidden_size)``.
        """
        patches = self.patch_embed(images)  # (batch, width, H', W')
        tokens = patches.flatten(2)  # (batch, width, H' * W')
        # Pool over the patch axis, then put tokens ahead of channels so that
        # LayerNorm and the projection act on the channel dimension.
        tokens = self.pool(tokens).transpose(1, 2)
        tokens = self.norm(tokens)
        return self.proj(tokens)
|
||||
21
vision_adapter/vision_adapter/README.md
Normal file
21
vision_adapter/vision_adapter/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
---
|
||||
license: mit
|
||||
---
|
||||
|
||||
# Forge-1V Micro Vision Adapter Scaffold
|
||||
|
||||
This folder contains an experimental, untrained micro vision-adapter scaffold for future Forge-1V work.
|
||||
|
||||
It is intentionally separate from the main text checkpoint:
|
||||
|
||||
- The main `config.json` remains Llama-compatible.
|
||||
- The GGUF export remains text-only.
|
||||
- These files do not make the released model able to view images.
|
||||
|
||||
Suggested target design:
|
||||
|
||||
- Tiny patch encoder: 3-channel images to a small vision width.
|
||||
- Projection: vision width to the 1024-dimensional Forge text hidden size.
|
||||
- Prefix tokens: projected visual tokens can be prepended to the text sequence in a future custom multimodal training run.
|
||||
|
||||
Approximate extra parameters for the scaffold design are well under 1M (about 0.23M with the default configuration), keeping the total system under 400M parameters.
|
||||
8
vision_adapter/vision_adapter/config.json
Normal file
8
vision_adapter/vision_adapter/config.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"image_size": 224,
|
||||
"patch_size": 16,
|
||||
"vision_width": 128,
|
||||
"text_hidden_size": 1024,
|
||||
"num_prefix_tokens": 16,
|
||||
"status": "experimental_untrained_scaffold"
|
||||
}
|
||||
38
vision_adapter/vision_adapter/forge_micro_vision.py
Normal file
38
vision_adapter/vision_adapter/forge_micro_vision.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
@dataclass
class ForgeMicroVisionConfig:
    """Hyperparameters for the Forge micro vision-adapter scaffold.

    Field order is part of the interface: instances may be built
    positionally, so new fields should only ever be appended.
    """

    # --- image side ---
    # Nominal input resolution; not read by the adapter code in this module.
    image_size: int = 224
    # Used as both kernel size and stride of the patch-embedding convolution.
    patch_size: int = 16
    # Channel width of the patch embedding (also the LayerNorm width and the
    # input width of the projection).
    vision_width: int = 128

    # --- text side ---
    # Output width of the projection layer.
    text_hidden_size: int = 1024
    # Number of tokens the patch sequence is pooled down to.
    num_prefix_tokens: int = 16
|
||||
|
||||
|
||||
class ForgeMicroVisionAdapter(nn.Module):
    """Tiny untrained image-to-prefix adapter scaffold for Forge-1V experiments.

    Pipeline: a strided convolution (kernel size == stride == ``patch_size``)
    embeds non-overlapping image patches, the flattened patch sequence is
    average-pooled down to ``num_prefix_tokens`` tokens, and every token is
    layer-normalized and linearly projected to ``text_hidden_size``.
    """

    def __init__(self, config: "ForgeMicroVisionConfig | None" = None):
        """Create the adapter layers.

        Args:
            config: Adapter hyperparameters. When omitted, a fresh
                ``ForgeMicroVisionConfig()`` is created for this instance.
        """
        super().__init__()
        # Build the default here rather than in the signature: a signature
        # default is evaluated once and would be shared (and mutable) across
        # every adapter constructed without an explicit config.
        if config is None:
            config = ForgeMicroVisionConfig()
        self.config = config
        self.patch_embed = nn.Conv2d(
            3,
            config.vision_width,
            kernel_size=config.patch_size,
            stride=config.patch_size,
            bias=False,
        )
        # Pools along the last axis, which in forward() is the flattened
        # patch sequence.
        self.pool = nn.AdaptiveAvgPool1d(config.num_prefix_tokens)
        self.norm = nn.LayerNorm(config.vision_width)
        self.proj = nn.Linear(config.vision_width, config.text_hidden_size, bias=False)

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        """Map images to a sequence of prefix-token embeddings.

        Args:
            images: Image batch with 3 channels; expected layout is
                ``(batch, 3, height, width)``.

        Returns:
            For a 4-D input, a tensor of shape
            ``(batch, num_prefix_tokens, text_hidden_size)``.
        """
        patches = self.patch_embed(images)  # (batch, width, H', W')
        tokens = patches.flatten(2)  # (batch, width, H' * W')
        # Pool over the patch axis, then put tokens ahead of channels so that
        # LayerNorm and the projection act on the channel dimension.
        tokens = self.pool(tokens).transpose(1, 2)
        tokens = self.norm(tokens)
        return self.proj(tokens)
|
||||
Reference in New Issue
Block a user