初始化项目，由ModelHub XC社区提供模型

Model: llm-jp/llm-jp-4-8b-instruct Source: Original Platform
2026-05-05 10:09:39 +08:00
commit 1b1f4f796e
16 changed files with 3997 additions and 0 deletions
--- a/llmjp4_tokenizer.py
+++ b/llmjp4_tokenizer.py
@@ -0,0 +1,101 @@
+# llm-jp-4 tokenizer
+
+from collections.abc import Sequence
+import os
+
+from transformers import LlamaTokenizerFast
+from tokenizers import Tokenizer
+
+from .llmjp4_harmony import HarmonyMessageParser, HarmonyMessage
+
+
+class Llmjp4Tokenizer(LlamaTokenizerFast):
+    _HARMONY_TOKENS: set[str] = {
+        "<|start|>",
+        "<|message|>",
+        "<|channel|>",
+        "<|constrain|>",
+        "<|end|>",
+        "<|return|>",
+        "<|call|>",
+    }
+
+    # NOTE(odashi):
+    # Response schemas are not recognized automatically.
+    # We need to define them manually.
+    # https://github.com/huggingface/trl/issues/4609
+    _RESPONSE_SCHEMA = {
+        "type": "object",
+        "properties": {
+            "role": {"const": "assistant"},
+            "content": {"type": "string", "x-regex": r"<\|channel\|>final<\|message\|>(.*?)(?:<\|end\|>|<\|return\|>|$)"},
+            "thinking": {"type": "string", "x-regex": r"<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>"},
+            "tool_calls": {
+                "x-regex-iterator": r"<\|channel\|>commentary (to=functions\..*?<\|message\|>.*?)(?:<\|call\|>|$)",
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "type": {"const": "function"},
+                        "function": {
+                            "type": "object",
+                            "properties": {
+                                "name": {"type": "string", "x-regex": r"^to=functions\.(\w+)"},
+                                "arguments": {
+                                    "type": "object",
+                                    "x-regex": r"<\|message\|>(.*)",
+                                    "x-parser": "json",
+                                    "additionalProperties": {"type": "any"},
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+        },
+    }
+
+    @classmethod
+    def convert_to_native_format(cls, **kwargs):
+        # NOTE(odashi):
+        # Workaround for transformers 5.x.
+        # Guaranteeing the same inner behavior with TokenizersBackend.
+        # https://github.com/huggingface/transformers/blob/7d9754a05193eb79b1d86aa744b622b8068008cd/src/transformers/tokenization_utils_tokenizers.py#L110-L116
+        local_kwargs = dict(kwargs)
+        fast_tokenizer_file = local_kwargs.pop("tokenizer_file", None)
+        if fast_tokenizer_file is None or not os.path.isfile(fast_tokenizer_file):
+            raise ValueError("Tokenizer file must exist.")
+
+        local_kwargs["tokenizer_object"] = Tokenizer.from_file(fast_tokenizer_file)
+        return local_kwargs
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.response_schema = self._RESPONSE_SCHEMA
+
+        self._harmony_token_ids = {
+            self.convert_tokens_to_ids(token)
+            for token in self._HARMONY_TOKENS
+        }
+
+    def _decode(self, token_ids: int | list[int], *args, **kwargs):
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+
+        result: list[str] = []
+        prev_pos = 0
+
+        # NOTE(odashi):
+        # Ensure that text tokens are decoded without preceding Harmony tokens
+        # to avoid incorrect addition of whitespaces.
+        for pos, token_id in enumerate(token_ids, start=1):
+            if token_id in self._harmony_token_ids or pos == len(token_ids):
+                result.append(super()._decode(token_ids[prev_pos:pos], *args, **kwargs))
+                prev_pos = pos
+
+        return "".join(result)
+
+    def parse_harmony_message(self, token_ids: Sequence[int]) -> list[HarmonyMessage]:
+        """Helper function to parse token IDs into Harmony messages."""
+        return HarmonyMessageParser(self).get_all_messages(token_ids)