From 076280a3aeb016e9000b9235c10fb25c7d29ad14 Mon Sep 17 00:00:00 2001 From: aiyueqi Date: Mon, 15 Sep 2025 14:42:26 +0800 Subject: [PATCH] translation demo for cambricon mlu370 --- Dockerfile | 12 ++ README.md | 42 +++++ fastapi_translate.py | 139 +++++++++++++++ logger.py | 13 ++ requirements.txt | 8 + test.py | 31 ++++ tokenization_small100.py | 366 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 611 insertions(+) create mode 100644 Dockerfile create mode 100644 fastapi_translate.py create mode 100644 logger.py create mode 100644 requirements.txt create mode 100644 test.py create mode 100644 tokenization_small100.py diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..245c2a5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +FROM git.modelhub.org.cn:9443/enginex-cambricon/mlu370-pytorch:v25.01-torch2.5.0-torchmlu1.24.1-ubuntu22.04-py310 + +WORKDIR /workspace + +COPY requirements.txt /workspace +RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple + +ADD . /workspace + +EXPOSE 80 +CMD ["sh", "-c", "python3 fastapi_translate.py"] + diff --git a/README.md b/README.md index 62ef856..c66025b 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,44 @@ # enginex-mlu370-translation +# translation-transformers +## Quickstart +```shell +#构建docker镜像 +docker build . -t mlu370_translation + +#运行docker容器 +docker run -it -p 10078:80 --device=/dev/cambricon_dev0:/dev/cambricon_dev0 --device=/dev/cambricon_ctl --device=/dev/cambricon_ipcm0:/dev/cambricon_ipcm0 -e MODEL_TYPE=opus_mt -e MODEL_NAME=moxying/opus-mt-zh-en --name mlu370_translation_test mlu370_translation +``` +等待模型下载完成,出现以下日志时,代表服务启动成功 +```shell +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:80 (Press CTRL+C to quit) +``` +执行测试程序 +```shell +python3 test.py +``` +测试程序执行结果 +``` +Succeed! 
"""FastAPI service exposing a zh->en translation pipeline on an MLU device.

The model family is selected with the ``MODEL_TYPE`` environment variable and
the ModelScope repository with ``MODEL_NAME``. The snapshot is downloaded into
``MODEL_DIR`` and loaded once, when the application starts.
"""
import torch_mlu
import os
from fastapi import FastAPI, Query
from fastapi.responses import PlainTextResponse
from pydantic import BaseModel

from typing import List, Any

import uvicorn

from modelscope import snapshot_download

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import logger
log = logger.get_logger(__file__)

app = FastAPI()
# Reported by /v1/get_status: "Running" until load_model() finishes, then "Success".
status = "Running"
translator = None
device = None
model_type = None

MODEL_TYPE = ("nllb200", "small100", "mbart", "opus_mt")
MODEL_DIR = "/workspace/model"

# (src_lang, tgt_lang) handed to the pipeline for each supported MODEL_TYPE.
# Every entry translates Chinese -> English.
_LANG_CODES = {
    "small100": ("zh", "en"),
    "nllb200": ("zho_Hans", "eng_Latn"),
    "mbart": ("zh_CN", "en_XX"),
    # The original code had these two swapped ("eng" -> "zho").
    # NOTE(review): Marian tokenizers appear not to consume these codes, so the
    # swap presumably only affected the log line - confirm.
    "opus_mt": ("zho", "eng"),
}
_DEFAULT_LANG_CODES = ("zh", "en")


class TranslateRequest(BaseModel):
    """One item of the /v1/translate request body."""

    # Source text to translate.
    Text: str


@app.on_event("startup")
def load_model():
    """Download the model, build the translation pipeline and warm it up.

    Sets the module globals ``model_type``, ``translator`` and ``status``.
    """
    log.info("loading model")
    global status, translator, device, model_type

    model_type = extract_model_type()
    log.info(f"model_type={model_type}")

    fetch_model()

    tokenizer, model = get_tokenizer_model()
    model = model.to("mlu")

    translator = pipeline(
        task="translation",
        model=model,
        tokenizer=tokenizer,
        device="mlu",
        use_cache=True,
    )
    warm_up()

    status = "Success"
    log.info("model loaded successfully")


def fetch_model():
    """Download the ModelScope snapshot named by ``MODEL_NAME`` into ``MODEL_DIR``.

    Raises:
        ValueError: if ``MODEL_NAME`` is unset or empty.
    """
    mn = os.environ.get("MODEL_NAME", "")
    log.info(f"model_name={mn}")

    if not mn:
        # Fail with a clear message instead of passing "" to the downloader.
        raise ValueError("MODEL_NAME environment variable is not set")

    # Create the model directory itself; the original created only its parent.
    os.makedirs(MODEL_DIR, exist_ok=True)
    snapshot_download(mn, local_dir=MODEL_DIR)


def translator_helper(text):
    """Run the global pipeline on ``text`` with the codes for ``model_type``.

    Args:
        text: a string or a list of strings to translate.

    Returns:
        Whatever the pipeline returns; ``translate`` reads
        ``["translation_text"]`` from each element.
    """
    source_lang, target_lang = _LANG_CODES.get(model_type, _DEFAULT_LANG_CODES)

    output = translator(text, src_lang=source_lang, tgt_lang=target_lang)
    log.info(f"model_type={model_type}, src_lang={source_lang}, tgt_lang={target_lang}, output={output}")

    return output


def get_tokenizer_model():
    """Load the tokenizer and seq2seq model from ``MODEL_DIR``.

    ``small100`` uses the bundled ``SMALL100Tokenizer``; every other model
    type uses ``AutoTokenizer``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    if model_type == "small100":
        # Imported lazily so the other model types do not need this module.
        from tokenization_small100 import SMALL100Tokenizer
        tokenizer = SMALL100Tokenizer.from_pretrained(MODEL_DIR)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
    return tokenizer, model


def extract_model_type():
    """Read ``MODEL_TYPE`` from the environment and validate it.

    Returns:
        The lower-cased model type, guaranteed to be one of ``MODEL_TYPE``.
        An unsupported value terminates the process with exit code 1.
    """
    mt = os.environ.get("MODEL_TYPE", "")
    log.info(f"model_type_input={mt}")

    model = mt.lower()
    if model not in MODEL_TYPE:
        log.error(f"model_type {model} is not supported")
        # NOTE(review): os._exit skips interpreter cleanup and the server's
        # shutdown path; kept as-is because the exit code may be relied upon.
        os._exit(1)

    return model


def warm_up():
    """Translate one fixed sentence so the first real request is not the first run."""
    log.info("warming up...")

    warmup_test = translator_helper("今天的天气非常好")
    log.info(f"warm up completed! model_type={model_type}, response={warmup_test}")

    return warmup_test


@app.get("/v1/get_status")
async def get_status():
    """Return the current load status wrapped as ``{"data": {"status": ...}}``."""
    return {
        "data": {
            "status": status
        }
    }


@app.post("/v1/translate")
async def translate(
    payload: List[TranslateRequest],
):
    """Translate every ``Text`` in the request body in a single pipeline call.

    Returns:
        A one-element list ``[{"translations": [...]}]`` where each entry has
        ``origin_text`` and ``translated``; an empty payload yields a
        plain-text 400 response.
    """
    # NOTE(review): the pipeline call below is synchronous and runs inside an
    # ``async def`` handler, so it presumably blocks the event loop (including
    # /v1/get_status) while translating. Left unchanged because concurrent
    # inference on the device has not been verified.
    if not payload:
        # PlainTextResponse takes ``content``; the original ``text=`` keyword
        # raised TypeError and produced a 500 instead of this 400.
        return PlainTextResponse(content="Information missing", status_code=400)

    texts = [trans_request.Text for trans_request in payload]
    outputs = translator_helper(texts)

    translations = [
        {
            "origin_text": origin,
            "translated": output['translation_text']
        }
        for origin, output in zip(texts, outputs)
    ]

    return [{
        "translations": translations
    }]


if __name__ == '__main__':
    uvicorn.run("fastapi_translate:app", host="0.0.0.0", port=80, workers=1, access_log=False)
print("Succeed!") + print("Response:", response.json()) + else: + print(f"Failed,code: {response.status_code}") + print("Error detail:", response.text) + +except requests.exceptions.RequestException as e: + print("request error:", str(e)) + diff --git a/tokenization_small100.py b/tokenization_small100.py new file mode 100644 index 0000000..ea6adf1 --- /dev/null +++ b/tokenization_small100.py @@ -0,0 +1,366 @@ +# Copyright (c) 2022 Idiap Research Institute, http://www.idiap.ch/ +# Written by Alireza Mohammadshahi +# This is a modified version of https://github.com/huggingface/transformers/blob/main/src/transformers/models/m2m_100/tokenization_m2m_100.py +# which owns by Fariseq Authors and The HuggingFace Inc. team. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Tokenization classes for SMALL100."""
import json
import os
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union

import sentencepiece

from transformers.tokenization_utils import BatchEncoding, PreTrainedTokenizer
from transformers.utils import logging


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "spm_file": "sentencepiece.bpe.model",
    "tokenizer_config_file": "tokenizer_config.json",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "alirezamsh/small100": "https://huggingface.co/alirezamsh/small100/resolve/main/vocab.json",
    },
    "spm_file": {
        "alirezamsh/small100": "https://huggingface.co/alirezamsh/small100/resolve/main/sentencepiece.bpe.model",
    },
    "tokenizer_config_file": {
        "alirezamsh/small100": "https://huggingface.co/alirezamsh/small100/resolve/main/tokenizer_config.json",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "alirezamsh/small100": 1024,
}

# fmt: off
FAIRSEQ_LANGUAGE_CODES = {
    "m2m100": ["af", "am", "ar", "ast", "az", "ba", "be", "bg", "bn", "br", "bs", "ca", "ceb", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig", "ilo", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "lb", "lg", "ln", "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "ns", "oc", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sd", "si", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv", "sw", "ta", "th", "tl", "tn", "tr", "uk", "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zh", "zu"]
}
# fmt: on


class SMALL100Tokenizer(PreTrainedTokenizer):
    """
    Construct an SMALL100 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.
    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        spm_file (`str`):
            Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary.
        tgt_lang (`str`, *optional*):
            A string representing the target language. `"en"` is used when it is not given.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        language_codes (`str`, *optional*):
            What language codes to use. Should be `"m2m100"`.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:
            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.
            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        num_madeup_words (`int`, *optional*, defaults to 8):
            Number of extra ids added to `vocab_size` on top of the vocabulary and the language tokens.
    Examples:
    ```python
    >>> from tokenization_small100 import SMALL100Tokenizer
    >>> tokenizer = SMALL100Tokenizer.from_pretrained("alirezamsh/small100", tgt_lang="ro")
    >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
    >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
    >>> model(**model_inputs)  # should work
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ["input_ids", "attention_mask"]

    # Class-level placeholders; instances rebind (never mutate) these in
    # set_lang_special_tokens / _switch_to_target_mode.
    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        spm_file,
        tgt_lang=None,
        # NOTE(review): these five defaults appeared as "" in the reviewed text,
        # which looks like stripped angle-bracket markup; restored to the
        # upstream M2M100 tokenizer values - confirm against the original file.
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        language_codes="m2m100",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        num_madeup_words=8,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        self.language_codes = language_codes
        fairseq_language_code = FAIRSEQ_LANGUAGE_CODES[language_codes]
        self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in fairseq_language_code}

        # Work on a copy so the caller's list is never mutated, and tolerate an
        # explicit None value.
        additional_special_tokens = list(kwargs.get("additional_special_tokens") or [])
        additional_special_tokens += [
            self.get_lang_token(lang_code)
            for lang_code in fairseq_language_code
            if self.get_lang_token(lang_code) not in additional_special_tokens
        ]
        kwargs["additional_special_tokens"] = additional_special_tokens

        self.vocab_file = vocab_file
        self.encoder = load_json(vocab_file)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.spm_file = spm_file
        self.sp_model = load_spm(spm_file, self.sp_model_kwargs)

        self.encoder_size = len(self.encoder)

        # Language tokens take the ids immediately after the base vocabulary.
        self.lang_token_to_id = {
            self.get_lang_token(lang_code): self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code)
        }
        self.lang_code_to_id = {lang_code: self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code)}
        self.id_to_lang_token = {v: k for k, v in self.lang_token_to_id.items()}

        self._tgt_lang = tgt_lang if tgt_lang is not None else "en"
        self.cur_lang_id = self.get_lang_id(self._tgt_lang)
        self.num_madeup_words = num_madeup_words

        super().__init__(
            tgt_lang=tgt_lang,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            unk_token=unk_token,
            pad_token=pad_token,
            language_codes=language_codes,
            sp_model_kwargs=self.sp_model_kwargs,
            num_madeup_words=num_madeup_words,
            **kwargs,
        )

        self.set_lang_special_tokens(self._tgt_lang)

    @property
    def vocab_size(self) -> int:
        """Base vocabulary size plus language tokens plus `num_madeup_words`."""
        return len(self.encoder) + len(self.lang_token_to_id) + self.num_madeup_words

    @property
    def tgt_lang(self) -> str:
        """The current target language code."""
        return self._tgt_lang

    @tgt_lang.setter
    def tgt_lang(self, new_tgt_lang: str) -> None:
        # Changing the target language also refreshes prefix/suffix tokens.
        self._tgt_lang = new_tgt_lang
        self.set_lang_special_tokens(self._tgt_lang)

    def _tokenize(self, text: str) -> List[str]:
        """Split `text` into SentencePiece pieces."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id, checking language tokens before the vocabulary."""
        if token in self.lang_token_to_id:
            return self.lang_token_to_id[token]
        return self.encoder.get(token, self.encoder[self.unk_token])

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the decoder."""
        if index in self.id_to_lang_token:
            return self.id_to_lang_token[index]
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        return self.sp_model.decode(tokens)

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.
        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.
        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # `_switch_to_target_mode` sets `prefix_tokens` to None; treat that as
        # "no prefix" instead of failing on len(None).
        prefix_ones = [1] * len(self.prefix_tokens or [])
        suffix_ones = [1] * len(self.suffix_tokens or [])
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding the current prefix and suffix tokens.
        The result is `prefix_tokens + X + suffix_tokens`:
        - after `set_lang_special_tokens`: `[tgt_lang_code] X [eos]`
        - after `_switch_to_target_mode`: `X [eos]` (no prefix)
        Pairs of sequences are not the expected use case, but they will be handled without a separator.
        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix = self.prefix_tokens or []  # None means "no prefix"
        if token_ids_1 is None:
            return prefix + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return prefix + token_ids_0 + token_ids_1 + self.suffix_tokens

    def get_vocab(self) -> Dict:
        """Return the token-to-id mapping for every id below `vocab_size`, plus added tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self) -> Dict:
        # The SentencePiece processor is dropped from the pickled state and
        # reloaded from `spm_file` in __setstate__.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d: Dict) -> None:
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, str]:
        """
        Write the vocabulary JSON and the SentencePiece model into `save_directory`.
        Args:
            save_directory (`str`):
                Existing directory to write into.
            filename_prefix (`str`, *optional*):
                If given, `"<prefix>-"` is prepended to both file names.
        Returns:
            `Tuple[str, str]`: Paths of the saved vocabulary file and SentencePiece model file.
        Raises:
            OSError: If `save_directory` is not a directory.
        """
        save_dir = Path(save_directory)
        if not save_dir.is_dir():
            raise OSError(f"{save_directory} should be a directory")
        prefix = filename_prefix + "-" if filename_prefix else ""
        vocab_save_path = save_dir / (prefix + self.vocab_files_names["vocab_file"])
        spm_save_path = save_dir / (prefix + self.vocab_files_names["spm_file"])

        save_json(self.encoder, vocab_save_path)

        if os.path.abspath(self.spm_file) != os.path.abspath(spm_save_path) and os.path.isfile(self.spm_file):
            copyfile(self.spm_file, spm_save_path)
        elif not os.path.isfile(self.spm_file):
            # The original model file is gone: serialize the in-memory model.
            with open(spm_save_path, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (str(vocab_save_path), str(spm_save_path))

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "ro",
        **kwargs,
    ) -> BatchEncoding:
        """Set the target language, then delegate to the base-class `prepare_seq2seq_batch`."""
        # The `tgt_lang` setter already resets the special tokens.
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _build_translation_inputs(self, raw_inputs, tgt_lang: Optional[str], **extra_kwargs):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if tgt_lang is None:
            raise ValueError("Translation requires a `tgt_lang` for this model")
        # Only the target language is used by this tokenizer; drop a `src_lang`
        # keyword so it is not forwarded to the tokenizer call.
        # NOTE(review): presumably the translation pipeline always passes
        # `src_lang` - confirm.
        extra_kwargs.pop("src_lang", None)
        self.tgt_lang = tgt_lang
        inputs = self(raw_inputs, add_special_tokens=True, **extra_kwargs)
        return inputs

    def _switch_to_input_mode(self):
        """Use `[tgt_lang_code]` as prefix and `[eos]` as suffix."""
        self.set_lang_special_tokens(self.tgt_lang)

    def _switch_to_target_mode(self):
        """Use no prefix and `[eos]` as suffix."""
        self.prefix_tokens = None
        self.suffix_tokens = [self.eos_token_id]

    def set_lang_special_tokens(self, src_lang: str) -> None:
        """Reset the special tokens to the tgt lang setting. prefix=[tgt_lang_code] and suffix=[eos].

        Despite its name, every caller in this class passes the *target* language as `src_lang`.
        """
        lang_token = self.get_lang_token(src_lang)
        self.cur_lang_id = self.lang_token_to_id[lang_token]
        self.prefix_tokens = [self.cur_lang_id]
        self.suffix_tokens = [self.eos_token_id]

    def get_lang_token(self, lang: str) -> str:
        """Return the `__xx__` token for language code `lang` (KeyError if unknown)."""
        return self.lang_code_to_token[lang]

    def get_lang_id(self, lang: str) -> int:
        """Return the vocabulary id of the language token for `lang`."""
        lang_token = self.get_lang_token(lang)
        return self.lang_token_to_id[lang_token]


def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    """Create a SentencePiece processor with `sp_model_kwargs` and load the model at `path`."""
    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
    spm.Load(str(path))
    return spm


def load_json(path: str) -> Union[Dict, List]:
    """Read a JSON file as UTF-8, independent of the process locale."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def save_json(data, path: str) -> None:
    """Write `data` as indented JSON, encoded as UTF-8."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)