Files
2025-08-06 15:49:14 +08:00

284 lines
12 KiB
Python

# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import csv
import json
import os
from pathlib import Path
import numpy as np
import torch
from transformers import AutoTokenizer
from tvm.contrib.profiling import Profiler
import xtrt_llm
from xtrt_llm.runtime import ModelConfig, SamplingConfig
from build import get_engine_name # isort:skip
EOS_TOKEN = 2
PAD_TOKEN = 0
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--max_output_len', type=int, required=True)
parser.add_argument('--log_level', type=str, default='error')
parser.add_argument('--model_version',
type=str,
default='v1_13b',
choices=['v1_7b', 'v1_13b', 'v2_7b', 'v2_13b'])
parser.add_argument('--engine_dir', type=str, default='baichuan_outputs')
parser.add_argument('--tokenizer_dir',
type=str,
default="baichuan-inc/Baichuan-13B-Chat",
help="Directory containing the tokenizer.model.")
parser.add_argument('--input_text', type=str, default="解释一下“温故而知新”")
parser.add_argument(
'--input_tokens',
dest='input_file',
type=str,
help=
'CSV or Numpy file containing tokenized input. Alternative to text input.',
default=None)
parser.add_argument('--output_csv',
type=str,
help='CSV file where the tokenized output is stored.',
default=None)
parser.add_argument('--output_npy',
type=str,
help='Numpy file where the tokenized output is stored.',
default=None)
parser.add_argument('--num_beams',
type=int,
help="Use beam search if num_beams >1",
default=1)
parser.add_argument(
'--performance_test_scale',
type=str,
help=
"Scale for performance test. e.g., 8x1024x64 (batch_size, input_text_length, max_output_length)",
default="")
return parser.parse_args()
def generate(
max_output_len: int,
log_level: str = 'error',
model_version: str = 'v1_13b',
engine_dir: str = 'baichuan_outputs',
input_text: str = "解释一下“温故而知新”",
input_file: str = None,
output_csv: str = None,
output_npy: str = None,
tokenizer_dir: str = None,
num_beams: int = 1,
performance_test_scale: str = "",
):
xtrt_llm.logger.set_level(log_level)
config_path = os.path.join(engine_dir, 'config.json')
with open(config_path, 'r') as f:
config = json.load(f)
use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin']
remove_input_padding = config['plugin_config']['remove_input_padding']
paged_kv_cache = config['plugin_config']['paged_kv_cache']
tokens_per_block = config['plugin_config']['tokens_per_block']
dtype = config['builder_config']['precision']
world_size = config['builder_config']['tensor_parallel']
# assert world_size == xtrt_llm.mpi_world_size(), \
# f'Engine world size ({world_size}) != Runtime world size ({xtrt_llm.mpi_world_size()})'
num_heads = config['builder_config']['num_heads'] // world_size
hidden_size = config['builder_config']['hidden_size'] // world_size
vocab_size = config['builder_config']['vocab_size']
num_layers = config['builder_config']['num_layers']
builder_config = config['builder_config']
gather_all_token_logits = builder_config.get('gather_all_token_logits',
False)
runtime_rank = xtrt_llm.mpi_rank()
if world_size > 1:
os.environ["XCCL_GROUP_ID"] = str(runtime_rank // world_size)
os.environ["XCCL_NRANKS"] = str(world_size)
os.environ["XCCL_CUR_RANK"] = str(runtime_rank % world_size)
os.environ["XCCL_DEVICE_ID"] = str(runtime_rank)
os.environ["MP_RUN"] = str(1)
runtime_mapping = xtrt_llm.Mapping(world_size,
runtime_rank,
tp_size=world_size)
torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
use_fast=False,
trust_remote_code=True)
model_config = ModelConfig(num_heads=num_heads,
num_kv_heads=num_heads,
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
gpt_attention_plugin=use_gpt_attention_plugin,
paged_kv_cache=paged_kv_cache,
tokens_per_block=tokens_per_block,
remove_input_padding=remove_input_padding,
dtype=dtype,
gather_all_token_logits=gather_all_token_logits)
repetition_penalty = 1.1
temperature = 0.3
top_k = 5
top_p = 0.85
if args.model_version == 'v1_7b':
temperature = 1
top_k = 1
top_p = 0
elif args.model_version == 'v2_7b' or args.model_version == 'v2_13b':
repetition_penalty = 1.05
sampling_config = SamplingConfig(end_id=EOS_TOKEN,
pad_id=PAD_TOKEN,
num_beams=num_beams,
repetition_penalty=repetition_penalty,
temperature=temperature,
top_k=top_k,
top_p=top_p)
engine_name = get_engine_name('baichuan', dtype, world_size, runtime_rank)
serialize_path = os.path.join(engine_dir, engine_name)
decoder = xtrt_llm.runtime.GenerationSession(model_config, serialize_path,
runtime_mapping)
input_tokens = []
if input_file is None:
input_tokens.append(
tokenizer.encode(input_text, add_special_tokens=False))
else:
if input_file.endswith('.csv'):
with open(input_file, 'r') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
for line in csv_reader:
input_tokens.append(np.array(line, dtype='int32'))
elif input_file.endswith('.npy'):
inputs = np.load(input_file)
for row in inputs:
row = row[row != EOS_TOKEN]
input_tokens.append(row)
else:
print('Input file format not supported.')
raise SystemExit
input_ids = None
input_lengths = None
if input_file is None:
input_ids = torch.tensor(input_tokens, dtype=torch.int32, device='cuda')
input_lengths = torch.tensor([input_ids.size(1)],
dtype=torch.int32,
device='cuda')
else:
input_lengths = torch.tensor([len(x) for x in input_tokens],
dtype=torch.int32,
device='cuda')
if remove_input_padding:
input_ids = np.concatenate(input_tokens)
input_ids = torch.tensor(input_ids,
dtype=torch.int32,
device='cuda').unsqueeze(0)
else:
input_ids = torch.nested.to_padded_tensor(
torch.nested.nested_tensor(input_tokens, dtype=torch.int32),
EOS_TOKEN).cuda()
if performance_test_scale != "":
performance_test_scale_list = performance_test_scale.split("E")
warmup_epochs = 3
for scale in performance_test_scale_list:
for i in range(warmup_epochs):
xtrt_llm.logger.info(
f"Running performance test with scale {scale}")
bs, seqlen, max_output_len = [int(x) for x in scale.split("x")]
try:
_input_ids = torch.from_numpy(
np.zeros((bs, seqlen)).astype("int32")).cuda()
_input_lengths = torch.from_numpy(
np.full((bs, ), seqlen).astype("int32")).cuda()
max_input_length = torch.max(_input_lengths).item()
decoder.setup(_input_lengths.size(0), max_input_length,
max_output_len, num_beams)
with Profiler(f'{bs}_{seqlen}_{max_output_len}_decode',
show_report=True):
output_ids = decoder.decode(
_input_ids,
_input_lengths,
sampling_config,
stop_words_list=[tokenizer.eos_token_id])
except Exception as e:
xtrt_llm.logger.info(
f"Error occurs in performance test: {e}.")
exit(0)
max_input_length = torch.max(input_lengths).item()
decoder.setup(input_lengths.size(0),
max_input_length,
max_output_len,
beam_width=num_beams)
output_ids = decoder.decode(input_ids,
input_lengths,
sampling_config,
stop_words_list=[tokenizer.eos_token_id])
torch.cuda.synchronize()
if runtime_rank == 0:
if output_csv is None and output_npy is None:
for b in range(input_lengths.size(0)):
inputs = input_tokens[b]
input_text = tokenizer.decode(inputs)
print(f'Input: \"{input_text}\"')
if num_beams <= 1:
output_begin = max_input_length
outputs = output_ids[b][0][output_begin:].tolist()
output_text = tokenizer.decode(outputs)
print(f'Output: \"{output_text}\"')
else:
for beam in range(num_beams):
output_begin = input_lengths[b]
output_end = input_lengths[b] + max_output_len
outputs = output_ids[b][beam][
output_begin:output_end].tolist()
output_text = tokenizer.decode(outputs)
print(f'Output: \"{output_text}\"')
output_ids = output_ids.reshape((-1, output_ids.size(2)))
if output_csv is not None:
output_file = Path(output_csv)
output_file.parent.mkdir(exist_ok=True, parents=True)
outputs = output_ids.tolist()
with open(output_file, 'w') as csv_file:
writer = csv.writer(csv_file, delimiter=',')
writer.writerows(outputs)
if output_npy is not None:
output_file = Path(output_npy)
output_file.parent.mkdir(exist_ok=True, parents=True)
outputs = np.array(output_ids.cpu().contiguous(), dtype='int32')
np.save(output_file, outputs)
return
if __name__ == '__main__':
args = parse_arguments()
generate(**vars(args))