初始化项目，由ModelHub XC社区提供模型

Model: North-ML1/willow-alpha Source: Original Platform
2026-06-10 15:02:23 +08:00
commit 1767ed14d9
14 changed files with 81526 additions and 0 deletions
--- a/vision_adapter/README.md
+++ b/vision_adapter/README.md
@@ -0,0 +1,21 @@
+---
+license: mit
+---
+
+# Forge-1V Micro Vision Adapter Scaffold
+
+This folder contains an experimental, untrained micro vision-adapter scaffold for future Forge-1V work.
+
+It is intentionally separate from the main text checkpoint:
+
+- The main `config.json` remains Llama-compatible.
+- The GGUF export remains text-only.
+- These files do not make the released model able to view images.
+
+Suggested target design:
+
+- Tiny patch encoder: 3-channel images to a small vision width.
+- Projection: vision width to the 1024-dimensional Forge text hidden size.
+- Prefix tokens: projected visual tokens can be prepended to the text sequence in a future custom multimodal training run.
+
+With the default config the scaffold adds about 0.23M parameters (well under 1M), keeping the total system under 400M parameters.
--- a/vision_adapter/config.json
+++ b/vision_adapter/config.json
@@ -0,0 +1,8 @@
+{
+  "image_size": 224,
+  "patch_size": 16,
+  "vision_width": 128,
+  "text_hidden_size": 1024,
+  "num_prefix_tokens": 16,
+  "status": "experimental_untrained_scaffold"
+}
--- a/vision_adapter/forge_micro_vision.py
+++ b/vision_adapter/forge_micro_vision.py
@@ -0,0 +1,38 @@
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+
+
+@dataclass
+class ForgeMicroVisionConfig:
+    """Hyper-parameters for the Forge micro vision adapter scaffold."""
+
+    # Presumably the expected square input side in pixels (TODO confirm);
+    # ForgeMicroVisionAdapter does not read this field.
+    image_size: int = 224
+    # Kernel size and stride of the patch-embedding convolution.
+    patch_size: int = 16
+    # Output channels of the patch convolution; also the LayerNorm width and
+    # the input width of the final projection.
+    vision_width: int = 128
+    # Output width of the final linear projection.
+    text_hidden_size: int = 1024
+    # Output length of the adaptive average pool over the patch sequence.
+    num_prefix_tokens: int = 16
+
+
+class ForgeMicroVisionAdapter(nn.Module):
+    """Tiny untrained image-to-prefix adapter scaffold for Forge-1V experiments.
+
+    Pipeline: a strided convolution cuts the image into non-overlapping
+    patches, the flattened patch sequence is average-pooled down to
+    ``config.num_prefix_tokens`` tokens, and each token is layer-normalised
+    and linearly projected to ``config.text_hidden_size``.
+
+    Args:
+        config: Adapter hyper-parameters. When omitted, a fresh
+            ``ForgeMicroVisionConfig()`` is created for this instance.
+    """
+
+    def __init__(self, config: "ForgeMicroVisionConfig | None" = None):
+        super().__init__()
+        # Build the default per instance: a default written in the signature
+        # is evaluated once, so every adapter would share (and could mutate)
+        # the same config object.
+        if config is None:
+            config = ForgeMicroVisionConfig()
+        self.config = config
+        # kernel_size == stride, so patches do not overlap.
+        self.patch_embed = nn.Conv2d(
+            3,
+            config.vision_width,
+            kernel_size=config.patch_size,
+            stride=config.patch_size,
+            bias=False,
+        )
+        self.pool = nn.AdaptiveAvgPool1d(config.num_prefix_tokens)
+        self.norm = nn.LayerNorm(config.vision_width)
+        self.proj = nn.Linear(config.vision_width, config.text_hidden_size, bias=False)
+
+    def forward(self, images: torch.Tensor) -> torch.Tensor:
+        """Encode a batch of images into prefix tokens.
+
+        Args:
+            images: 4-D tensor ``(batch, 3, height, width)``.
+
+        Returns:
+            Tensor of shape ``(batch, num_prefix_tokens, text_hidden_size)``.
+
+        Raises:
+            ValueError: If ``images`` is not 4-D.
+        """
+        # Conv2d also accepts an unbatched 3-D image, but the flatten/pool
+        # steps below assume a leading batch axis and would pool over the
+        # wrong dimension, so reject anything that is not 4-D up front.
+        if images.dim() != 4:
+            raise ValueError(
+                f"expected images of shape (batch, 3, H, W), got {tuple(images.shape)}"
+            )
+        patches = self.patch_embed(images)  # (batch, vision_width, H', W')
+        tokens = patches.flatten(2)  # (batch, vision_width, H' * W')
+        # Pool along the patch axis, then move channels last for LayerNorm.
+        tokens = self.pool(tokens).transpose(1, 2)
+        tokens = self.norm(tokens)
+        return self.proj(tokens)
--- a/vision_adapter/vision_adapter/README.md
+++ b/vision_adapter/vision_adapter/README.md
@@ -0,0 +1,21 @@
+---
+license: mit
+---
+
+# Forge-1V Micro Vision Adapter Scaffold
+
+This folder contains an experimental, untrained micro vision-adapter scaffold for future Forge-1V work.
+
+It is intentionally separate from the main text checkpoint:
+
+- The main `config.json` remains Llama-compatible.
+- The GGUF export remains text-only.
+- These files do not make the released model able to view images.
+
+Suggested target design:
+
+- Tiny patch encoder: 3-channel images to a small vision width.
+- Projection: vision width to the 1024-dimensional Forge text hidden size.
+- Prefix tokens: projected visual tokens can be prepended to the text sequence in a future custom multimodal training run.
+
+With the default config the scaffold adds about 0.23M parameters (well under 1M), keeping the total system under 400M parameters.
--- a/vision_adapter/vision_adapter/config.json
+++ b/vision_adapter/vision_adapter/config.json
@@ -0,0 +1,8 @@
+{
+  "image_size": 224,
+  "patch_size": 16,
+  "vision_width": 128,
+  "text_hidden_size": 1024,
+  "num_prefix_tokens": 16,
+  "status": "experimental_untrained_scaffold"
+}
--- a/vision_adapter/vision_adapter/forge_micro_vision.py
+++ b/vision_adapter/vision_adapter/forge_micro_vision.py
@@ -0,0 +1,38 @@
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+
+
+@dataclass
+class ForgeMicroVisionConfig:
+    """Hyper-parameters for the Forge micro vision adapter scaffold."""
+
+    # Presumably the expected square input side in pixels (TODO confirm);
+    # ForgeMicroVisionAdapter does not read this field.
+    image_size: int = 224
+    # Kernel size and stride of the patch-embedding convolution.
+    patch_size: int = 16
+    # Output channels of the patch convolution; also the LayerNorm width and
+    # the input width of the final projection.
+    vision_width: int = 128
+    # Output width of the final linear projection.
+    text_hidden_size: int = 1024
+    # Output length of the adaptive average pool over the patch sequence.
+    num_prefix_tokens: int = 16
+
+
+class ForgeMicroVisionAdapter(nn.Module):
+    """Tiny untrained image-to-prefix adapter scaffold for Forge-1V experiments.
+
+    Pipeline: a strided convolution cuts the image into non-overlapping
+    patches, the flattened patch sequence is average-pooled down to
+    ``config.num_prefix_tokens`` tokens, and each token is layer-normalised
+    and linearly projected to ``config.text_hidden_size``.
+
+    Args:
+        config: Adapter hyper-parameters. When omitted, a fresh
+            ``ForgeMicroVisionConfig()`` is created for this instance.
+    """
+
+    def __init__(self, config: "ForgeMicroVisionConfig | None" = None):
+        super().__init__()
+        # Build the default per instance: a default written in the signature
+        # is evaluated once, so every adapter would share (and could mutate)
+        # the same config object.
+        if config is None:
+            config = ForgeMicroVisionConfig()
+        self.config = config
+        # kernel_size == stride, so patches do not overlap.
+        self.patch_embed = nn.Conv2d(
+            3,
+            config.vision_width,
+            kernel_size=config.patch_size,
+            stride=config.patch_size,
+            bias=False,
+        )
+        self.pool = nn.AdaptiveAvgPool1d(config.num_prefix_tokens)
+        self.norm = nn.LayerNorm(config.vision_width)
+        self.proj = nn.Linear(config.vision_width, config.text_hidden_size, bias=False)
+
+    def forward(self, images: torch.Tensor) -> torch.Tensor:
+        """Encode a batch of images into prefix tokens.
+
+        Args:
+            images: 4-D tensor ``(batch, 3, height, width)``.
+
+        Returns:
+            Tensor of shape ``(batch, num_prefix_tokens, text_hidden_size)``.
+
+        Raises:
+            ValueError: If ``images`` is not 4-D.
+        """
+        # Conv2d also accepts an unbatched 3-D image, but the flatten/pool
+        # steps below assume a leading batch axis and would pool over the
+        # wrong dimension, so reject anything that is not 4-D up front.
+        if images.dim() != 4:
+            raise ValueError(
+                f"expected images of shape (batch, 3, H, W), got {tuple(images.shape)}"
+            )
+        patches = self.patch_embed(images)  # (batch, vision_width, H', W')
+        tokens = patches.flatten(2)  # (batch, vision_width, H' * W')
+        # Pool along the patch axis, then move channels last for LayerNorm.
+        tokens = self.pool(tokens).transpose(1, 2)
+        tokens = self.norm(tokens)
+        return self.proj(tokens)