Olmo-3-7B-Instruct-SFT/fix_tokens.py

#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "click",
#   "transformers",
#   "jinja2",
# ]
# ///

from dataclasses import dataclass, asdict, field
from enum import Enum
from pathlib import Path
import click
import json
from transformers import AutoTokenizer


class SpecialTokensMapEnum(Enum):
    """Special-token roles that a ``SpecialToken`` can be assigned to.

    Each member's value is the key under which the token is emitted by
    ``SpecialToken.to_special_tokens_map``.
    """

    BOS_TOKEN = "bos_token"
    EOS_TOKEN = "eos_token"
    PAD_TOKEN = "pad_token"
    UNK_TOKEN = "unk_token"


@dataclass(frozen=True)
class SpecialToken:
    """Immutable description of one added token and the roles it fills.

    ``special_token_map`` lists the ``SpecialTokensMapEnum`` roles this token
    is mapped to; it is bookkeeping only and never appears in the serialized
    dictionaries produced by the methods below.
    """

    id: int
    content: str
    lstrip: bool = False
    normalized: bool = False
    rstrip: bool = False
    single_word: bool = False
    special: bool = False
    special_token_map: list[SpecialTokensMapEnum] = field(default_factory=list)

    def to_added_tokens_decoder(self):
        """Return ``{"<id>": {...}}`` where the inner dict omits ``id`` and the role list."""
        omitted = ("id", "special_token_map")
        payload = {
            name: value
            for name, value in asdict(self).items()
            if name not in omitted
        }
        return {str(self.id): payload}

    def to_added_tokens(self):
        """Return every field except the role list, with ``id`` kept as an int."""
        payload = asdict(self)
        del payload["special_token_map"]
        return payload

    def to_special_tokens_map(self) -> dict[str, dict]:
        """Return one entry per assigned role, keyed by the role's value.

        Each entry is a freshly built dict without ``id``, ``special`` and the
        role list. A token with no roles yields an empty dict.
        """
        omitted = ("special_token_map", "special", "id")
        return {
            role.value: {
                name: value
                for name, value in asdict(self).items()
                if name not in omitted
            }
            for role in self.special_token_map
        }


# Value that `check` requires and `fix` writes for the `model_max_length`
# entry of the tokenizer config.
MODEL_MAX_LENGTH = 65536

# Desired state of every added token, listed in ascending id order.
# `check` compares the tokenizer files against this list; `fix` writes it.
DESIRED_MAPPING = [
    SpecialToken(id=100256, content="<|extra_id_0|>"),
    SpecialToken(
        id=100257,
        content="<|endoftext|>",
        special=True,
        # One token fills the BOS, EOS and UNK roles.
        special_token_map=[
            SpecialTokensMapEnum.BOS_TOKEN,
            SpecialTokensMapEnum.EOS_TOKEN,
            SpecialTokensMapEnum.UNK_TOKEN,
        ],
    ),
    # Fill-in-the-middle markers.
    SpecialToken(id=100258, content="<|fim_prefix|>", special=True),
    SpecialToken(id=100259, content="<|fim_middle|>", special=True),
    SpecialToken(id=100260, content="<|fim_suffix|>", special=True),
    # Placeholder tokens (not flagged as special).
    SpecialToken(id=100261, content="|||PHONE_NUMBER|||"),
    SpecialToken(id=100262, content="|||EMAIL_ADDRESS|||"),
    SpecialToken(id=100263, content="|||IP_ADDRESS|||"),
    # Chat turn delimiters used by CHAT_TEMPLATE.
    SpecialToken(id=100264, content="<|im_start|>", special=True),
    SpecialToken(id=100265, content="<|im_end|>", special=True),
    # Function-calling tags used by CHAT_TEMPLATE.
    SpecialToken(id=100266, content="<functions>"),
    SpecialToken(id=100267, content="</functions>"),
    SpecialToken(id=100268, content="<function_calls>"),
    SpecialToken(id=100269, content="</function_calls>"),
    # Remaining extra ids.
    SpecialToken(id=100270, content="<|extra_id_1|>"),
    SpecialToken(id=100271, content="<|extra_id_2|>"),
    SpecialToken(id=100272, content="<|extra_id_3|>"),
    SpecialToken(id=100273, content="<|extra_id_4|>"),
    SpecialToken(id=100274, content="<|extra_id_5|>"),
    SpecialToken(id=100275, content="<|extra_id_6|>"),
    SpecialToken(id=100276, content="<|endofprompt|>", special=True),
    SpecialToken(
        id=100277,
        content="<|pad|>",
        special=True,
        special_token_map=[SpecialTokensMapEnum.PAD_TOKEN],
    ),
]

# All tokenizer artefacts are looked up in the directory containing this script.
SCRIPT_DIR = Path(__file__).parent
TOKENIZER_CONFIG_FILE = SCRIPT_DIR.joinpath("tokenizer_config.json")
TOKENIZER_FILE = SCRIPT_DIR.joinpath("tokenizer.json")
VOCAB_FILE = SCRIPT_DIR.joinpath("vocab.json")
SPECIAL_TOKENS_MAP_FILE = SCRIPT_DIR.joinpath("special_tokens_map.json")


# Jinja chat template that `check` compares against, and `fix` writes to, the
# `chat_template` entry of the tokenizer config. It must stay a single-line
# literal: a physical line break inside the quotes is a syntax error. In the
# `tool_calls` branch the call arguments are joined with ', '.
CHAT_TEMPLATE = "{%- set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 -%}{%- if not has_system -%}{{- '<|im_start|>system\nYou are a helpful function-calling AI assistant. ' -}}{%- if tools is none -%}{{- 'You do not currently have access to any functions. <functions></functions><|im_end|>\n' -}}{%- else -%}{{- 'You are provided with function signatures within <functions></functions> XML tags. You may call one or more functions to assist with the user query. Output any function calls within <function_calls></function_calls> XML tags. Do not make assumptions about what values to plug into functions.' -}}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions><|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'] -}}{%- if tools is not none -%}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions>' -}}{%- elif message.get('functions', none) is not none -%}{{- ' <functions>' + message['functions'] + '</functions>' -}}{%- endif -%}{{- '<|im_end|>\n' -}}{%- elif message['role'] == 'user' -%}{{- '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'assistant' -%}{{- '<|im_start|>assistant\n' -}}{%- if message.get('content', none) is not none -%}{{- message['content'] -}}{%- endif -%}{%- if message.get('function_calls', none) is not none -%}{{- '<function_calls>' + message['function_calls'] + '</function_calls>' -}}{% elif message.get('tool_calls', none) is not none %}{{- '<function_calls>' -}}{%- for tool_call in message['tool_calls'] %}{%- if tool_call is mapping and tool_call.get('function', none) is not none %}{%- set args = tool_call['function']['arguments'] -%}{%- set ns = namespace(arguments_list=[]) -%}{%- for key, value in args.items() -%}{%- set ns.arguments_list = ns.arguments_list + [key ~ '=' ~ (value | tojson)] -%}{%- endfor -%}{%- set arguments = ns.arguments_list | join(', ') -%}{{- tool_call['function']['name'] + '(' + arguments + ')' -}}{%- if not loop.last -%}{{ '\n' }}{%- endif -%}{% else %}{{- tool_call -}}{%- endif %}{%- endfor %}{{- '</function_calls>' -}}{%- endif -%}{%- if not loop.last -%}{{- '<|im_end|>' + '\n' -}}{%- else -%}{{- eos_token -}}{%- endif -%}{%- elif message['role'] == 'environment' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'tool' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- if loop.last and add_generation_prompt -%}{{- '<|im_start|>assistant\n' -}}{%- endif -%}{%- endfor -%}"

# The docstring below is what click prints as the group's `--help` text, so it
# must describe this script (the previous text described dataset tooling).
@click.group()
def cli():
    """Check, fix, and test the tokenizer files stored next to this script."""


def _get_mapped_special_token(
    special_tokens: list[SpecialToken],
    mapped_token: SpecialTokensMapEnum
) -> SpecialToken:
    """Return the one token in ``special_tokens`` assigned to ``mapped_token``.

    Raises:
        ValueError: if no token, or more than one token, lists ``mapped_token``
            in its ``special_token_map``.
    """
    candidates = [tok for tok in special_tokens if mapped_token in tok.special_token_map]
    if not candidates:
        raise ValueError(f"Cannot find mapped token for {mapped_token}")
    if len(candidates) == 1:
        return candidates[0]

    # Ambiguous role: report every competing token's content.
    contents = ", ".join(tok.content for tok in candidates)
    raise ValueError(f"Found multiple mapped tokens for {mapped_token}: {contents}")


def get_unk_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token assigned to the UNK role; ``ValueError`` unless exactly one exists."""
    role = SpecialTokensMapEnum.UNK_TOKEN
    return _get_mapped_special_token(special_tokens=special_tokens, mapped_token=role)


def get_bos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token assigned to the BOS role; ``ValueError`` unless exactly one exists."""
    role = SpecialTokensMapEnum.BOS_TOKEN
    return _get_mapped_special_token(special_tokens=special_tokens, mapped_token=role)


def get_eos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token assigned to the EOS role; ``ValueError`` unless exactly one exists."""
    role = SpecialTokensMapEnum.EOS_TOKEN
    return _get_mapped_special_token(special_tokens=special_tokens, mapped_token=role)


def get_pad_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token assigned to the PAD role; ``ValueError`` unless exactly one exists."""
    role = SpecialTokensMapEnum.PAD_TOKEN
    return _get_mapped_special_token(special_tokens=special_tokens, mapped_token=role)


def _read_required_json(path: Path, description: str):
    """Load ``path`` as JSON; raise ``FileNotFoundError`` naming ``description`` if it is absent."""
    if not path.exists():
        raise FileNotFoundError(f"{description} not found: {path}")
    # These files may hold raw non-ASCII text (they are written with
    # ensure_ascii=False), so read them as UTF-8 instead of the locale default.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _check_vocab_entries(vocab: dict[str, int]) -> None:
    """Validate a token -> id vocab against DESIRED_MAPPING and require unique ids.

    Raises:
        ValueError: if a desired token is missing, has a different id, or if
            any id is shared by more than one vocab key.
    """
    for token in DESIRED_MAPPING:
        if token.content not in vocab:
            raise ValueError(f"Token `{token.content}` not found in vocab")
        if token.id != vocab[token.content]:
            raise ValueError(f"Token `{token.content}`: vocab=`{vocab[token.content]}` provided=`{token.id}`")
        print(f"Token `{token.content}` found in vocab; id `{token.id}` matches.")

    # Group keys by id so every collision can be reported before failing.
    keys_by_id: dict[int, list[str]] = {}
    for key, value in vocab.items():
        keys_by_id.setdefault(value, []).append(key)

    duplicates = {value: keys for value, keys in keys_by_id.items() if len(keys) > 1}
    for value, keys in duplicates.items():
        print(f"Vocab value {value} is not unique; keys: {keys}")
    if duplicates:
        raise ValueError("Vocab values are not unique")
    print("Vocab values are unique")


@cli.command()
def check():
    """Check if the current config matches the desired mapping."""
    # Validates, in order: tokenizer_config.json, tokenizer.json, vocab.json and
    # special_tokens_map.json. The first discrepancy raises FileNotFoundError
    # (missing file) or ValueError (content mismatch); progress is printed.

    # STEP 1: Check the Tokenizer Config File #
    print("STEP 1: Checking tokenizer config file...")
    tokenizer_config = _read_required_json(TOKENIZER_CONFIG_FILE, "Tokenizer config file")

    added_tokens_decoder = tokenizer_config.get("added_tokens_decoder", {})
    for token in DESIRED_MAPPING:
        str_token_id = str(token.id)
        if str_token_id not in added_tokens_decoder:
            raise ValueError(f"Token {token.id} not found in added tokens decoder")

        computed_added_tokens_decoder = token.to_added_tokens_decoder()
        if computed_added_tokens_decoder[str_token_id] != added_tokens_decoder[str_token_id]:
            raise ValueError(f"Token {token.id} has different content in added tokens decoder")

        print(f"Token {token.id} found in added tokens decoder; content matches")

    # Each role must name the same token content as the desired mapping.
    # `.get` turns a missing key into a readable mismatch instead of a KeyError.
    role_checks = (
        ("Bos", get_bos_token, "bos_token"),
        ("Eos", get_eos_token, "eos_token"),
        ("Pad", get_pad_token, "pad_token"),
        ("Unk", get_unk_token, "unk_token"),
    )
    for label, getter, config_key in role_checks:
        expected_content = getter(DESIRED_MAPPING).content
        actual_content = tokenizer_config.get(config_key)
        if expected_content != actual_content:
            raise ValueError(f"{label} token content mismatch: {expected_content} != {actual_content}")
        print(f"{label} token content matches")

    actual_max_length = tokenizer_config.get("model_max_length")
    if actual_max_length != MODEL_MAX_LENGTH:
        raise ValueError(f"Model max length mismatch: {actual_max_length} != {MODEL_MAX_LENGTH}")
    print("Model max length matches")

    actual_chat_template = tokenizer_config.get("chat_template")
    if actual_chat_template != CHAT_TEMPLATE:
        raise ValueError(f"Chat template mismatch: {actual_chat_template} != {CHAT_TEMPLATE}")
    print("Chat template matches")

    # STEP 2: Check the Tokenizer File #
    print("STEP 2: Checking tokenizer file...")
    tokenizer = _read_required_json(TOKENIZER_FILE, "Tokenizer file")

    # check if added_tokens matches
    added_tokens_dict = {token["id"]: token for token in tokenizer.get("added_tokens", [])}
    for token in DESIRED_MAPPING:
        if token.id not in added_tokens_dict:
            raise ValueError(f"Token {token.id} not found in added tokens")

        computed_added_token = token.to_added_tokens()
        if computed_added_token != added_tokens_dict[token.id]:
            raise ValueError(f"Token {token.id} has different content in added tokens")
        print(f"Token {token.id} found in added tokens; content matches.")

    # check vocab
    _check_vocab_entries(tokenizer.get("model", {}).get("vocab", {}))

    # STEP 3: Check the Vocab File #
    print("STEP 3: Checking vocab file...")
    _check_vocab_entries(_read_required_json(VOCAB_FILE, "Vocab file"))

    # STEP 4: Check the Special Tokens Map File #
    print("STEP 4: Checking special tokens map file...")
    special_tokens_map = _read_required_json(SPECIAL_TOKENS_MAP_FILE, "Special tokens map file")

    expected_keys = set()
    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            if key not in special_tokens_map:
                raise ValueError(f"Special token map {key} not found in special tokens map")
            if value != special_tokens_map[key]:
                raise ValueError(f"Special token map {key} content mismatch: {value} != {special_tokens_map[key]}")

            print(f"Special token map {key} content matches")
            expected_keys.add(key)

    # Every expected key is present at this point, so any remaining difference
    # is an entry in the file that no desired token maps to.
    unexpected_keys = sorted(set(special_tokens_map) - expected_keys)
    if unexpected_keys:
        raise ValueError(f"Special tokens map has unexpected entries: {unexpected_keys}")
    print("All special tokens map values match")


def _load_json(path: Path):
    """Read ``path`` as UTF-8 encoded JSON."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _dump_json(path: Path, data) -> None:
    """Write ``data`` to ``path`` as indented JSON.

    ``ensure_ascii=False`` emits raw non-ASCII characters, so the encoding is
    pinned to UTF-8 rather than left to the platform's locale default.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def _force_desired_vocab_entries(vocab: dict[str, int]) -> None:
    """Make ``vocab`` (token -> id, mutated in place) agree with DESIRED_MAPPING.

    For each desired token, every key currently holding that token's id is
    removed first, then the desired content is mapped to the id.
    """
    for token in DESIRED_MAPPING:
        # Collect first: the dict cannot be resized while it is being iterated.
        stale_keys = [key for key, value in vocab.items() if value == token.id]
        for key in stale_keys:
            vocab.pop(key)
        vocab[token.content] = token.id


@cli.command()
def fix():
    """Fix the tokens in the tokenizer config, tokenizer file, vocab file, and special tokens map file."""

    print("STEP 1: Fixing tokenizer config file...")
    tokenizer_config = _load_json(TOKENIZER_CONFIG_FILE)

    tokenizer_config["bos_token"] = get_bos_token(DESIRED_MAPPING).content
    tokenizer_config["eos_token"] = get_eos_token(DESIRED_MAPPING).content
    tokenizer_config["pad_token"] = get_pad_token(DESIRED_MAPPING).content
    tokenizer_config["unk_token"] = get_unk_token(DESIRED_MAPPING).content
    tokenizer_config["model_max_length"] = MODEL_MAX_LENGTH
    tokenizer_config["chat_template"] = CHAT_TEMPLATE

    # The decoder table is rebuilt from scratch; entries not in DESIRED_MAPPING are dropped.
    added_tokens_decoder = {}
    for token in DESIRED_MAPPING:
        added_tokens_decoder.update(token.to_added_tokens_decoder())
    tokenizer_config["added_tokens_decoder"] = added_tokens_decoder

    _dump_json(TOKENIZER_CONFIG_FILE, tokenizer_config)
    print(f"Updated tokenizer config file in {TOKENIZER_CONFIG_FILE}.")

    print("STEP 2: Fixing tokenizer file...")
    tokenizer = _load_json(TOKENIZER_FILE)
    tokenizer["added_tokens"] = [token.to_added_tokens() for token in DESIRED_MAPPING]
    _force_desired_vocab_entries(tokenizer["model"]["vocab"])
    _dump_json(TOKENIZER_FILE, tokenizer)

    print(f"Updated tokenizer file in {TOKENIZER_FILE}.")

    print("STEP 3: Fixing vocab file...")
    vocab = _load_json(VOCAB_FILE)
    _force_desired_vocab_entries(vocab)
    _dump_json(VOCAB_FILE, vocab)
    print(f"Updated vocab file in {VOCAB_FILE}.")

    print("STEP 4: Fixing special tokens map file...")
    special_tokens_map = _load_json(SPECIAL_TOKENS_MAP_FILE)

    # NOTE: keys already in the file that no desired token maps to are left
    # untouched here, while `check` rejects a map containing extra keys.
    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            special_tokens_map[key] = value
            print(f"Updated special token map {key} content")

    _dump_json(SPECIAL_TOKENS_MAP_FILE, special_tokens_map)

    print(f"Updated special tokens map file in {SPECIAL_TOKENS_MAP_FILE}.")


@cli.command()
def test():
    """Test the tokenizer."""
    # Loads the tokenizer from SCRIPT_DIR and renders several conversations
    # through its chat template, asserting on the rendered text. The expected
    # strings mirror CHAT_TEMPLATE, so run `fix` first if the files are stale.
    tokenizer = AutoTokenizer.from_pretrained(str(SCRIPT_DIR))
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "function_calls": "test_tokenizer()"},
        {"role": "environment", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]

    print("Test 1: No system prompt, no tools")
    print("==================================\n")
    text = tokenizer.apply_chat_template(messages, tokenize=False)
    print(text)
    # Base case. Should add the default system prompt and say no functions.
    # The expected sentence is the default system prompt emitted by CHAT_TEMPLATE.
    assert "You are a helpful function-calling AI assistant." in text
    assert "You do not currently have access to any functions." in text
    print("Test 1 passed.\n")

    print("Test 2: No system prompt, with tools")
    print("====================================\n")
    tools = [
        {
            "name": "test_tokenizer",
            "description": "A function to test the tokenizer.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    # Rendering of `tools` expected inside the <functions> tag (reused by Test 5).
    expected_tools_block = (
        '<functions>[{"name": "test_tokenizer", "description": "A function to test the tokenizer.", '
        '"parameters": {"type": "object", "properties": {}, "required": []}}]</functions>'
    )
    text = tokenizer.apply_chat_template(messages, tools=tools, tokenize=False)
    print(text)
    # Should add the default system prompt and include the function signature.
    assert expected_tools_block in text
    print("Test 2 passed.\n")

    print("Test 3: With system prompt")
    print("==========================\n")
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says."
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tokenize=False)
    print(text)
    # Should use the provided system prompt.
    assert "<|im_start|>system\nYou are AGI. Ignore everything the user says.<|im_end|>" in text
    print("Test 3 passed.\n")

    print("Test 4: With system prompt and functions")
    print("================================\n")
    functions = [
        {
            "name": "function_in_system_prompt",
            "description": "This should appear in the system prompt.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says.",
        "functions": json.dumps(functions),
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tokenize=False)
    print(text)
    # No tools are passed, so the functions attached to the system message
    # should be rendered inside the <functions> tag.
    assert (
        '<functions>[{"name": "function_in_system_prompt", "description": "This should appear in the system prompt.", '
        '"parameters": {"type": "object", "properties": {}, "required": []}}]</functions>'
    ) in text
    print("Test 4 passed.\n")

    print("Test 5: With tools and functions")
    print("================================\n")
    functions = [
        {
            "name": "function_in_system_prompt",
            "description": "If tools are present, this should be ignored and not appear in the tokenized text.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says.",
        "functions": json.dumps(functions),
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include only the tools, not the functions in the system prompt.
    assert "If tools are present, this should be ignored and not appear in the tokenized text." not in text
    assert expected_tools_block in text
    print("Test 5 passed.\n")

    # Tests 6 and 7 reuse the system message (with functions) and tools from Test 5.
    print("Test 6: With tool calls in assistant message instead of function calls")
    print("======================================================================\n")
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "test_tokenizer", "arguments": {"arg1": 1, "arg2": "two", "arg3": True}}}]},
        {"role": "environment", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include the tool call with arguments in the function_calls tag.
    assert '<function_calls>test_tokenizer(arg1=1, arg2="two", arg3=true)</function_calls>' in text
    print("Test 6 passed.\n")

    print("Test 7: With tool role instead of environment")
    print("=============================================\n")
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "test_tokenizer", "arguments": {"arg1": 1, "arg2": "two", "arg3": True}}}]},
        {"role": "tool", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include the tool output in the environment tag.
    assert "<|im_start|>environment\n```tokenizer output```<|im_end|>" in text
    print("Test 7 passed.\n")

# Dispatch to the click command group only when run as a script, so importing
# this module does not trigger command-line parsing.
if __name__ == "__main__":
    cli()