add pkgs
This commit is contained in:
116
examples/utils.py
Normal file
116
examples/utils.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from transformers import AutoTokenizer, T5Tokenizer
|
||||
|
||||
DEFAULT_HF_MODEL_DIRS = {
|
||||
'baichuan': 'baichuan-inc/Baichuan-13B-Chat',
|
||||
'bloom': 'bigscience/bloom-560m',
|
||||
'chatglm_6b': 'THUDM/chatglm-6b',
|
||||
'chatglm2_6b': 'THUDM/chatglm2-6b',
|
||||
'chatglm2_6b_32k': 'THUDM/chatglm2-6b-32k',
|
||||
'chatglm3_6b': 'THUDM/chatglm3-6b',
|
||||
'chatglm3-6b': 'THUDM/chatglm3-6b',
|
||||
'chatglm3_6b_base': 'THUDM/chatglm3-6b-base',
|
||||
'chatglm3_6b_32k': 'THUDM/chatglm3-6b-32k',
|
||||
'falcon': 'tiiuae/falcon-rw-1b',
|
||||
'glm_10b': 'THUDM/glm-10b',
|
||||
'gpt': 'gpt2-medium',
|
||||
'gptj': 'EleutherAI/gpt-j-6b',
|
||||
'gptneox': 'EleutherAI/gpt-neox-20b',
|
||||
'internlm': 'internlm/internlm-chat-7b',
|
||||
'llama': 'meta-llama/Llama-2-7b-hf',
|
||||
'mpt': 'mosaicml/mpt-7b',
|
||||
'opt': 'facebook/opt-350m',
|
||||
'qwen': 'Qwen/Qwen-7B',
|
||||
}
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATES = {
|
||||
'internlm':
|
||||
"<|User|>:{input_text}<eoh>\n<|Bot|>:",
|
||||
'qwen':
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n",
|
||||
}
|
||||
|
||||
|
||||
def read_model_name_from_config(config_path: Path):
|
||||
with open(config_path, 'r') as f:
|
||||
config = json.load(f)
|
||||
return config['builder_config']['name']
|
||||
|
||||
|
||||
def throttle_generator(generator, stream_interval):
|
||||
for i, out in enumerate(generator):
|
||||
if not i % stream_interval:
|
||||
yield out
|
||||
|
||||
if i % stream_interval:
|
||||
yield out
|
||||
|
||||
|
||||
def load_tokenizer(tokenizer_dir: Optional[str] = None,
|
||||
vocab_file: Optional[str] = None,
|
||||
model_name: str = 'gpt'):
|
||||
if vocab_file is None:
|
||||
# Should set both padding_side and truncation_side to be 'left'
|
||||
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
|
||||
legacy=False,
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True)
|
||||
else:
|
||||
# For gpt-next, directly load from tokenizer.model
|
||||
assert model_name == 'gpt'
|
||||
tokenizer = T5Tokenizer(vocab_file=vocab_file,
|
||||
padding_side='left',
|
||||
truncation_side='left')
|
||||
|
||||
if model_name == 'qwen':
|
||||
with open(Path(tokenizer_dir) / "generation_config.json") as f:
|
||||
gen_config = json.load(f)
|
||||
chat_format = gen_config['chat_format']
|
||||
if chat_format == 'raw':
|
||||
pad_id = gen_config['pad_token_id']
|
||||
end_id = gen_config['eos_token_id']
|
||||
elif chat_format == 'chatml':
|
||||
pad_id = tokenizer.im_end_id
|
||||
end_id = tokenizer.im_end_id
|
||||
else:
|
||||
raise Exception(f"unknown chat format: {chat_format}")
|
||||
elif model_name == 'qwen2':
|
||||
with open(Path(tokenizer_dir) / "generation_config.json") as f:
|
||||
gen_config = json.load(f)
|
||||
|
||||
### if model type is chat pad_id = end_id = gen_config["eos_token_id"][0]
|
||||
if isinstance (gen_config["eos_token_id"], list):
|
||||
pad_id = end_id = gen_config["eos_token_id"][0]
|
||||
### if model type is base, run this branch
|
||||
else:
|
||||
pad_id = gen_config["bos_token_id"]
|
||||
end_id = gen_config["eos_token_id"]
|
||||
elif model_name == 'glm_10b':
|
||||
pad_id = tokenizer.pad_token_id
|
||||
end_id = tokenizer.eop_token_id
|
||||
else:
|
||||
if tokenizer.pad_token_id is None:
|
||||
tokenizer.pad_token_id = tokenizer.eos_token_id
|
||||
pad_id = tokenizer.pad_token_id
|
||||
end_id = tokenizer.eos_token_id
|
||||
|
||||
return tokenizer, pad_id, end_id
|
||||
Reference in New Issue
Block a user