From 076280a3aeb016e9000b9235c10fb25c7d29ad14 Mon Sep 17 00:00:00 2001
From: aiyueqi <aiyueqi@4paradigm.com>
Date: Mon, 15 Sep 2025 14:42:26 +0800
Subject: [PATCH] translation demo for cambricon mlu370

---
 Dockerfile               |  12 ++
 README.md                |  42 +++++
 fastapi_translate.py     | 139 +++++++++++++++
 logger.py                |  13 ++
 requirements.txt         |   8 +
 test.py                  |  31 ++++
 tokenization_small100.py | 366 +++++++++++++++++++++++++++++++++++++++
 7 files changed, 611 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 fastapi_translate.py
 create mode 100644 logger.py
 create mode 100644 requirements.txt
 create mode 100644 test.py
 create mode 100644 tokenization_small100.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..245c2a5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,12 @@
+FROM git.modelhub.org.cn:9443/enginex-cambricon/mlu370-pytorch:v25.01-torch2.5.0-torchmlu1.24.1-ubuntu22.04-py310
+
+WORKDIR /workspace
+
+COPY requirements.txt /workspace
+RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+ADD . /workspace
+
+EXPOSE 80
+CMD ["sh", "-c", "python3 fastapi_translate.py"]
+
diff --git a/README.md b/README.md
index 62ef856..c66025b 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,44 @@
 # enginex-mlu370-translation
+# translation-transformers
+## Quickstart
+```shell
+#构建docker镜像
+docker build . -t mlu370_translation
+
+#运行docker容器
+docker run -it -p 10078:80 --device=/dev/cambricon_dev0:/dev/cambricon_dev0 --device=/dev/cambricon_ctl --device=/dev/cambricon_ipcm0:/dev/cambricon_ipcm0 -e MODEL_TYPE=opus_mt -e MODEL_NAME=moxying/opus-mt-zh-en --name mlu370_translation_test mlu370_translation
+```
+等待模型下载完成，出现以下日志时，代表服务启动成功
+```shell
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:80 (Press CTRL+C to quit)
+```
+执行测试程序
+```shell
+python3 test.py
+```
+测试程序执行结果
+```
+Succeed!
+Response: [{'translations': [{'origin_text': '生活就像一块巧克力', 'translated': 'Life is like a piece of chocolate.'}, {'origin_text': '你来自哪里', 'translated': 'Where are you from?'}, {'origin_text': '你吃饭了吗', 'translated': 'Have you eaten yet?'}]}]
+```
+停止docker容器
+```
+docker stop mlu370_translation_test
+```
+## 模型支持
+在Quickstart中运行容器时，通过环境变量的方式，指定模型的类型和具体的模型名称，即：
+```
+-e MODEL_TYPE=opus_mt -e MODEL_NAME=moxying/opus-mt-zh-en
+```
+目前支持以下几种配置:
+| MODEL_TYPE |            MODEL_NAME                   |
+| ---------- | --------------------------------------- |
+| nllb200    | facebook/nllb-200-distilled-600M        |
+| small100   | aiyueqi/alirezamsh_small100             |
+| mbart      | facebook/mbart-large-50-many-to-many-mmt|
+| opus_mt    | moxying/opus-mt-zh-en                   |
+
+其中，MODEL_TYPE代表模型类型，必须为以上表格列举之一；MODEL_NAME是modelscope上面能够拉取到的模型名称，需要和MODEL_TYPE对应
+
 
diff --git a/fastapi_translate.py b/fastapi_translate.py
new file mode 100644
index 0000000..95bae71
--- /dev/null
+++ b/fastapi_translate.py
@@ -0,0 +1,139 @@
+import torch_mlu
+import os
+from fastapi import FastAPI, Query
+from fastapi.responses import PlainTextResponse
+from pydantic import BaseModel
+
+from typing import List, Any
+
+import uvicorn
+
+from modelscope import snapshot_download
+
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+import logger
+log = logger.get_logger(__file__)
+
+app = FastAPI()
+status = "Running"
+translator = None
+device = None
+model_type = None
+
+MODEL_TYPE = ("nllb200", "small100", "mbart", "opus_mt")
+MODEL_DIR = "/workspace/model"
+
+class TranslateRequest(BaseModel):
+    Text: str
+
+@app.on_event("startup")
+def load_model():
+    log.info("loading model")
+    global status, translator, device, model_type
+
+    model_type = extract_model_type()
+    log.info(f"model_type={model_type}")
+
+    fetch_model()
+
+    tokenizer, model = get_tokenizer_model()
+    #log.info(f"tokenizer={tokenizer}, model={model}")
+    model = model.to("mlu")
+
+    translator = pipeline(task="translation", model=model, tokenizer=tokenizer, device="mlu", use_cache=True)
+    warm_up()
+
+    status = "Success"
+    log.info("model loaded successfully")
+
+def fetch_model():
+    mn = os.environ.get("MODEL_NAME", "")
+    log.info(f"model_name={mn}")
+
+    os.makedirs(os.path.dirname(MODEL_DIR), exist_ok=True)
+    snapshot_download(mn, local_dir=MODEL_DIR)
+
+def translator_helper(text):
+    source_lang = "zh"
+    target_lang = "en"
+
+    if model_type == "nllb200":
+        source_lang = "zho_Hans"
+        target_lang = "eng_Latn"
+    if model_type == "mbart":
+        source_lang = "zh_CN"
+        target_lang = "en_XX"
+    if model_type == "opus_mt":
+        source_lang = "eng"
+        target_lang = "zho"
+
+    output = translator(text, src_lang=source_lang, tgt_lang=target_lang)
+    log.info(f"model_type={model_type}, src_lang={source_lang}, tgt_lang={target_lang}, output={output}")
+
+    return output
+
+def get_tokenizer_model():
+    if model_type == "small100":
+        from tokenization_small100 import SMALL100Tokenizer
+        tokenizer = SMALL100Tokenizer.from_pretrained(MODEL_DIR)
+        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
+        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
+
+    return tokenizer, model
+
+def extract_model_type():
+    mt = os.environ.get("MODEL_TYPE", "")
+    log.info(f"model_type_input={mt}")
+
+    model = mt.lower()
+    if model not in MODEL_TYPE:
+        log.error(f"model_type {model} is not supported")
+        os._exit(1)
+
+    return model
+
+def warm_up():
+    log.info("warming up...")
+
+    warmup_test = translator_helper("今天的天气非常好")
+    log.info(f"warm up completed! model_type={model_type}, response={warmup_test}")
+
+    return warmup_test
+
+@app.get("/v1/get_status")
+async def get_status():
+    ret = {
+        "data": {
+            "status": status
+        }
+    }
+    return ret
+
+@app.post("/v1/translate")
+async def translate(
+    payload: List[TranslateRequest],
+):
+    if not payload:
+        return PlainTextResponse(text="Information missing", status_code=400)
+    results = []
+    texts = []
+    for trans_request in payload:
+        translations = []
+        texts.append(trans_request.Text)
+    
+    outputs = translator_helper(texts)
+    for i in range(0, len(texts)):
+        translations.append({
+            "origin_text": texts[i],
+            "translated": outputs[i]['translation_text']
+        })
+       
+    results.append({
+        "translations": translations
+    })
+    return results
+
+if __name__ == '__main__':
+    uvicorn.run("fastapi_translate:app", host="0.0.0.0", port=80, workers=1, access_log=False)
diff --git a/logger.py b/logger.py
new file mode 100644
index 0000000..d238e70
--- /dev/null
+++ b/logger.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+import logging
+import os
+
+logging.basicConfig(
+    format="%(asctime)s %(name)-12s %(levelname)-4s %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=os.environ.get("LOGLEVEL", "INFO"),
+)
+
+def get_logger(file):
+    return logging.getLogger(file)
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d4de08c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+fastapi
+uvicorn
+sentencepiece==0.2.0
+sacremoses==0.1.1
+protobuf==3.20.3
+modelscope
+transformers==4.45.0
+
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..062e516
--- /dev/null
+++ b/test.py
@@ -0,0 +1,31 @@
+import requests
+url = "http://127.0.0.1:10078/v1/translate"
+
+body = [
+    {"Text": "生活就像一块巧克力"},
+    {"Text": "你来自哪里"},
+    {"Text": "你吃饭了吗"},
+]
+
+headers = {
+    "Content-Type": "application/json",
+    "Accept": "application/json"
+}
+
+try:
+    response = requests.post(
+        url,
+        json=body,
+        headers=headers
+    )
+
+    if response.status_code == 200:
+        print("Succeed!")
+        print("Response:", response.json())
+    else:
+        print(f"Failed，code: {response.status_code}")
+        print("Error detail:", response.text)
+
+except requests.exceptions.RequestException as e:
+    print("request error:", str(e))
+
diff --git a/tokenization_small100.py b/tokenization_small100.py
new file mode 100644
index 0000000..ea6adf1
--- /dev/null
+++ b/tokenization_small100.py
@@ -0,0 +1,366 @@
+# Copyright (c) 2022 Idiap Research Institute, http://www.idiap.ch/
+# Written by Alireza Mohammadshahi <alireza.mohammadshahi@idiap.ch>
+# This is a modified version of https://github.com/huggingface/transformers/blob/main/src/transformers/models/m2m_100/tokenization_m2m_100.py 
+# which owns by Fariseq Authors and The HuggingFace Inc. team.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for SMALL100."""
+import json
+import os
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import sentencepiece
+
+from transformers.tokenization_utils import BatchEncoding, PreTrainedTokenizer
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "spm_file": "sentencepiece.bpe.model",
+    "tokenizer_config_file": "tokenizer_config.json",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "alirezamsh/small100": "https://huggingface.co/alirezamsh/small100/resolve/main/vocab.json",
+    },
+    "spm_file": {
+        "alirezamsh/small100": "https://huggingface.co/alirezamsh/small100/resolve/main/sentencepiece.bpe.model",
+    },
+    "tokenizer_config_file": {
+        "alirezamsh/small100": "https://huggingface.co/alirezamsh/small100/resolve/main/tokenizer_config.json",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "alirezamsh/small100": 1024,
+}
+
+# fmt: off
+FAIRSEQ_LANGUAGE_CODES = {
+    "m2m100": ["af", "am", "ar", "ast", "az", "ba", "be", "bg", "bn", "br", "bs", "ca", "ceb", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig", "ilo", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "lb", "lg", "ln", "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "ns", "oc", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sd", "si", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv", "sw", "ta", "th", "tl", "tn", "tr", "uk", "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zh", "zu"]
+}
+# fmt: on
+
+
+class SMALL100Tokenizer(PreTrainedTokenizer):
+    """
+    Construct an SMALL100 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        spm_file (`str`):
+            Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
+            contains the vocabulary.
+        tgt_lang (`str`, *optional*):
+            A string representing the target language.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        language_codes (`str`, *optional*):
+            What language codes to use. Should be `"m2m100"`.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    Examples:
+    ```python
+    >>> from tokenization_small100 import SMALL100Tokenizer
+    >>> tokenizer = SMALL100Tokenizer.from_pretrained("alirezamsh/small100", tgt_lang="ro")
+    >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
+    >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
+    >>> model(**model_inputs)  # should work
+    ```"""
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    model_input_names = ["input_ids", "attention_mask"]
+
+    prefix_tokens: List[int] = []
+    suffix_tokens: List[int] = []
+
+    def __init__(
+        self,
+        vocab_file,
+        spm_file,
+        tgt_lang=None,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        pad_token="<pad>",
+        unk_token="<unk>",
+        language_codes="m2m100",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        num_madeup_words=8,
+        **kwargs,
+    ) -> None:
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        self.language_codes = language_codes
+        fairseq_language_code = FAIRSEQ_LANGUAGE_CODES[language_codes]
+        self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in fairseq_language_code}
+
+        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
+        kwargs["additional_special_tokens"] += [
+            self.get_lang_token(lang_code)
+            for lang_code in fairseq_language_code
+            if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"]
+        ]
+
+        self.vocab_file = vocab_file
+        self.encoder = load_json(vocab_file)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.spm_file = spm_file
+        self.sp_model = load_spm(spm_file, self.sp_model_kwargs)
+
+        self.encoder_size = len(self.encoder)
+
+        self.lang_token_to_id = {
+            self.get_lang_token(lang_code): self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code)
+        }
+        self.lang_code_to_id = {lang_code: self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code)}
+        self.id_to_lang_token = {v: k for k, v in self.lang_token_to_id.items()}
+
+        self._tgt_lang = tgt_lang if tgt_lang is not None else "en"
+        self.cur_lang_id = self.get_lang_id(self._tgt_lang)
+        self.num_madeup_words = num_madeup_words
+        
+        super().__init__(
+            tgt_lang=tgt_lang,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            language_codes=language_codes,
+            sp_model_kwargs=self.sp_model_kwargs,
+            num_madeup_words=num_madeup_words,
+            **kwargs,
+        )
+        
+        self.set_lang_special_tokens(self._tgt_lang)
+
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.encoder) + len(self.lang_token_to_id) + self.num_madeup_words
+
+    @property
+    def tgt_lang(self) -> str:
+        return self._tgt_lang
+
+    @tgt_lang.setter
+    def tgt_lang(self, new_tgt_lang: str) -> None:
+        self._tgt_lang = new_tgt_lang
+        self.set_lang_special_tokens(self._tgt_lang)
+
+    def _tokenize(self, text: str) -> List[str]:
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        if token in self.lang_token_to_id:
+            return self.lang_token_to_id[token]
+        return self.encoder.get(token, self.encoder[self.unk_token])
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Converts an index (integer) in a token (str) using the decoder."""
+        if index in self.id_to_lang_token:
+            return self.id_to_lang_token[index]
+        return self.decoder.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        return self.sp_model.decode(tokens)
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        prefix_ones = [1] * len(self.prefix_tokens)
+        suffix_ones = [1] * len(self.suffix_tokens)
+        if token_ids_1 is None:
+            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
+        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:
+        - `input_ids` (for encoder) `X [eos, src_lang_code]`
+        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
+        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
+        separator.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            if self.prefix_tokens is None:
+                return token_ids_0 + self.suffix_tokens
+            else:
+                return self.prefix_tokens + token_ids_0 + self.suffix_tokens
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        if self.prefix_tokens is None:
+            return token_ids_0 + token_ids_1 + self.suffix_tokens
+        else:
+            return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
+
+    def get_vocab(self) -> Dict:
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def __getstate__(self) -> Dict:
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d: Dict) -> None:
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        save_dir = Path(save_directory)
+        if not save_dir.is_dir():
+            raise OSError(f"{save_directory} should be a directory")
+        vocab_save_path = save_dir / (
+            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
+        )
+        spm_save_path = save_dir / (
+            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["spm_file"]
+        )
+
+        save_json(self.encoder, vocab_save_path)
+
+        if os.path.abspath(self.spm_file) != os.path.abspath(spm_save_path) and os.path.isfile(self.spm_file):
+            copyfile(self.spm_file, spm_save_path)
+        elif not os.path.isfile(self.spm_file):
+            with open(spm_save_path, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (str(vocab_save_path), str(spm_save_path))
+
+    def prepare_seq2seq_batch(
+        self,
+        src_texts: List[str],
+        tgt_texts: Optional[List[str]] = None,
+        tgt_lang: str = "ro",
+        **kwargs,
+    ) -> BatchEncoding:
+        self.tgt_lang = tgt_lang
+        self.set_lang_special_tokens(self.tgt_lang)
+        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
+
+    def _build_translation_inputs(self, raw_inputs, tgt_lang: Optional[str], **extra_kwargs):
+        """Used by translation pipeline, to prepare inputs for the generate function"""
+        if tgt_lang is None:
+            raise ValueError("Translation requires a `tgt_lang` for this model")
+        self.tgt_lang = tgt_lang
+        inputs = self(raw_inputs, add_special_tokens=True, **extra_kwargs)
+        return inputs
+
+    def _switch_to_input_mode(self):
+        self.set_lang_special_tokens(self.tgt_lang)
+
+    def _switch_to_target_mode(self):
+        self.prefix_tokens = None
+        self.suffix_tokens = [self.eos_token_id]        
+
+    def set_lang_special_tokens(self, src_lang: str) -> None:
+        """Reset the special tokens to the tgt lang setting. No prefix and suffix=[eos, tgt_lang_code]."""
+        lang_token = self.get_lang_token(src_lang)
+        self.cur_lang_id = self.lang_token_to_id[lang_token]
+        self.prefix_tokens = [self.cur_lang_id]
+        self.suffix_tokens = [self.eos_token_id]
+
+    def get_lang_token(self, lang: str) -> str:
+        return self.lang_code_to_token[lang]
+
+    def get_lang_id(self, lang: str) -> int:
+        lang_token = self.get_lang_token(lang)
+        return self.lang_token_to_id[lang_token]
+
+
+def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
+    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
+    spm.Load(str(path))
+    return spm
+
+
+def load_json(path: str) -> Union[Dict, List]:
+    with open(path, "r") as f:
+        return json.load(f)
+
+
+def save_json(data, path: str) -> None:
+    with open(path, "w") as f:
+        json.dump(data, f, indent=2)
+