初始化项目,由ModelHub XC社区提供模型
Model: llm-jp/llm-jp-4-8b-instruct Source: Original Platform
This commit is contained in:
101
llmjp4_tokenizer.py
Normal file
101
llmjp4_tokenizer.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# llm-jp-4 tokenizer
|
||||
|
||||
from collections.abc import Sequence
|
||||
import os
|
||||
|
||||
from transformers import LlamaTokenizerFast
|
||||
from tokenizers import Tokenizer
|
||||
|
||||
from .llmjp4_harmony import HarmonyMessageParser, HarmonyMessage
|
||||
|
||||
|
||||
class Llmjp4Tokenizer(LlamaTokenizerFast):
|
||||
_HARMONY_TOKENS: set[str] = {
|
||||
"<|start|>",
|
||||
"<|message|>",
|
||||
"<|channel|>",
|
||||
"<|constrain|>",
|
||||
"<|end|>",
|
||||
"<|return|>",
|
||||
"<|call|>",
|
||||
}
|
||||
|
||||
# NOTE(odashi):
|
||||
# Response schemas are not recognized automatically.
|
||||
# We need to define them manually.
|
||||
# https://github.com/huggingface/trl/issues/4609
|
||||
_RESPONSE_SCHEMA = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"role": {"const": "assistant"},
|
||||
"content": {"type": "string", "x-regex": r"<\|channel\|>final<\|message\|>(.*?)(?:<\|end\|>|<\|return\|>|$)"},
|
||||
"thinking": {"type": "string", "x-regex": r"<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>"},
|
||||
"tool_calls": {
|
||||
"x-regex-iterator": r"<\|channel\|>commentary (to=functions\..*?<\|message\|>.*?)(?:<\|call\|>|$)",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": {"const": "function"},
|
||||
"function": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string", "x-regex": r"^to=functions\.(\w+)"},
|
||||
"arguments": {
|
||||
"type": "object",
|
||||
"x-regex": r"<\|message\|>(.*)",
|
||||
"x-parser": "json",
|
||||
"additionalProperties": {"type": "any"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def convert_to_native_format(cls, **kwargs):
|
||||
# NOTE(odashi):
|
||||
# Workaround for transformers 5.x.
|
||||
# Guaranteeing the same inner behavior with TokenizersBackend.
|
||||
# https://github.com/huggingface/transformers/blob/7d9754a05193eb79b1d86aa744b622b8068008cd/src/transformers/tokenization_utils_tokenizers.py#L110-L116
|
||||
local_kwargs = dict(kwargs)
|
||||
fast_tokenizer_file = local_kwargs.pop("tokenizer_file", None)
|
||||
if fast_tokenizer_file is None or not os.path.isfile(fast_tokenizer_file):
|
||||
raise ValueError("Tokenizer file must exist.")
|
||||
|
||||
local_kwargs["tokenizer_object"] = Tokenizer.from_file(fast_tokenizer_file)
|
||||
return local_kwargs
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
self.response_schema = self._RESPONSE_SCHEMA
|
||||
|
||||
self._harmony_token_ids = {
|
||||
self.convert_tokens_to_ids(token)
|
||||
for token in self._HARMONY_TOKENS
|
||||
}
|
||||
|
||||
def _decode(self, token_ids: int | list[int], *args, **kwargs):
|
||||
if isinstance(token_ids, int):
|
||||
token_ids = [token_ids]
|
||||
|
||||
result: list[str] = []
|
||||
prev_pos = 0
|
||||
|
||||
# NOTE(odashi):
|
||||
# Ensure that text tokens are decoded without preceding Harmony tokens
|
||||
# to avoid incorrect addition of whitespaces.
|
||||
for pos, token_id in enumerate(token_ids, start=1):
|
||||
if token_id in self._harmony_token_ids or pos == len(token_ids):
|
||||
result.append(super()._decode(token_ids[prev_pos:pos], *args, **kwargs))
|
||||
prev_pos = pos
|
||||
|
||||
return "".join(result)
|
||||
|
||||
def parse_harmony_message(self, token_ids: Sequence[int]) -> list[HarmonyMessage]:
|
||||
"""Helper function to parse token IDs into Harmony messages."""
|
||||
return HarmonyMessageParser(self).get_all_messages(token_ids)
|
||||
Reference in New Issue
Block a user