init
54  transformers/tests/deepspeed/ds_config_zero2.json  Normal file
@@ -0,0 +1,54 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "bf16": {
        "enabled": "auto"
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
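Every "auto" value in this ZeRO-2 config is intended to be filled in by the Hugging Face Trainer from its own arguments at launch time, so the DeepSpeed config never drifts out of sync with the training args. A minimal sketch of how a config like this is typically wired in (the output_dir and hyperparameter values below are illustrative, not part of this commit):

# Sketch only: the Trainer resolves the "auto" fields from these arguments.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    fp16=True,  # fills "fp16.enabled": "auto"
    deepspeed="tests/deepspeed/ds_config_zero2.json",  # the ZeRO-2 config added above
)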
56  transformers/tests/deepspeed/ds_config_zero3.json  Normal file
@@ -0,0 +1,56 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "offload_param": {
            "device": "none",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
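Compared to the ZeRO-2 file, this config switches to stage 3 parameter partitioning, adds an offload_param block, and enables stage3_gather_16bit_weights_on_model_save so a consolidated 16-bit checkpoint can be written at save time. A short sketch, assuming the dict form of the Trainer's deepspeed argument, of how one might load this config and turn CPU offload on for a smaller GPU:

# Sketch only: read the ZeRO-3 config and move optimizer/param state to CPU,
# trading speed for GPU memory. Values like 1e9 parse as plain JSON numbers.
import json

with open("tests/deepspeed/ds_config_zero3.json") as f:
    ds_config = json.load(f)

ds_config["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
ds_config["zero_optimization"]["offload_param"]["device"] = "cpu"
# the resulting dict can then be passed as TrainingArguments(deepspeed=ds_config, ...)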
1480  transformers/tests/deepspeed/test_deepspeed.py  Normal file
File diff suppressed because it is too large
373  transformers/tests/deepspeed/test_model_zoo.py  Normal file
@@ -0,0 +1,373 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import os
import subprocess
from os.path import dirname

from parameterized import parameterized

from tests.trainer.test_trainer import TrainerIntegrationCommon  # noqa
from transformers import is_torch_available
from transformers.testing_utils import (
    TestCasePlus,
    backend_device_count,
    execute_subprocess_async,
    get_tests_dir,
    require_deepspeed,
    require_torch_accelerator,
    run_first,
    slow,
    torch_device,
)
from transformers.trainer_utils import set_seed


if is_torch_available():
    from tests.trainer.test_trainer import (  # noqa
        RegressionModelConfig,
        RegressionPreTrainedModel,
        get_regression_trainer,
    )


set_seed(42)

FIXTURE_DIRECTORY = get_tests_dir("fixtures")
ROOT_DIRECTORY = os.path.join(dirname(get_tests_dir()))
DS_TESTS_DIRECTORY = dirname(os.path.abspath(__file__))

# default torch.distributed port
DEFAULT_MASTER_PORT = "10999"

T5_SMALL = "google-t5/t5-small"

# *** Working Models ***
ALBERT_TINY = "hf-internal-testing/tiny-albert"
BART_TINY = "sshleifer/bart-tiny-random"
BERT_TINY = "hf-internal-testing/tiny-bert"
BIGBIRD_PEGASUS_TINY = "hf-internal-testing/tiny-random-bigbird_pegasus"
BIG_BIRD_TINY = "hf-internal-testing/tiny-random-big_bird"
BLENDERBOT_TINY = "hf-internal-testing/tiny-random-blenderbot"
BLOOM_TINY = "bigscience/bigscience-small-testing"
DEBERTA_TINY = "hf-internal-testing/tiny-random-deberta"
DEBERTA_V2_TINY = "hf-internal-testing/tiny-random-deberta-v2"
DISTILBERT_TINY = "sshleifer/tiny-distilbert-base-cased"
ELECTRA_TINY = "hf-internal-testing/tiny-electra"
FLAUBERT_TINY = "hf-internal-testing/tiny-random-flaubert"
FSMT_TINY = "stas/tiny-wmt19-en-de"
FUNNEL_TINY = "hf-internal-testing/tiny-random-funnel"
GPT2_TINY = "sshleifer/tiny-gpt2"
GPTJ_TINY = "hf-internal-testing/tiny-random-gptj"
GPT_NEO_TINY = "hf-internal-testing/tiny-random-gpt_neo"
LAYOUTLM_TINY = "hf-internal-testing/tiny-layoutlm"
LED_TINY = "hf-internal-testing/tiny-random-led"
LONGFORMER_TINY = "hf-internal-testing/tiny-random-longformer"
M2M_100_TINY = "stas/tiny-m2m_100"  # hf tiny model is unsuitable
MARIAN_TINY = "sshleifer/tiny-marian-en-de"
MBART_TINY = "sshleifer/tiny-mbart"
MOBILEBERT_TINY = "hf-internal-testing/tiny-random-mobilebert"
MPNET_TINY = "hf-internal-testing/tiny-random-mpnet"
PEGASUS_TINY = "stas/pegasus-cnn_dailymail-tiny-random"
PROPHETNET_TINY = "hf-internal-testing/tiny-random-prophetnet"
ROBERTA_TINY = "sshleifer/tiny-distilroberta-base"
SQUEEZEBERT_TINY = "hf-internal-testing/tiny-random-squeezebert"
T5_TINY = "patrickvonplaten/t5-tiny-random"
T5_V1_TINY = "hf-internal-testing/tiny-random-t5-v1.1"
VIT_TINY = "hf-internal-testing/tiny-random-vit"
XLM_ROBERTA_TINY = "hf-internal-testing/tiny-xlm-roberta"
XLNET_TINY = "sshleifer/tiny-xlnet-base-cased"


# *** To Fix ***


# *** tiny model issues ***
# missing model files:
MT5_TINY = "hf-internal-testing/tiny-random-mt5"
CAMEMBERT_TINY = "hf-internal-testing/tiny-random-camembert"
OPENAI_GPT_TINY = "hf-internal-testing/tiny-random-openai-gpt"

# missing tokenizer files
CONVBERT_TINY = "hf-internal-testing/tiny-random-convbert"
LAYOUTLMV2_TINY = "hf-internal-testing/tiny-random-layoutlmv2"
HUBERT_TINY = "hf-internal-testing/tiny-random-hubert"

# issues with tokenizer
CTRL_TINY = "hf-internal-testing/tiny-random-ctrl"
TRANSFO_XL_TINY = "hf-internal-testing/tiny-random-transfo-xl"  # same as Salesforce/ctrl

# other issues with tiny models
IBERT_TINY = "hf-internal-testing/tiny-random-ibert"  # multiple issues with either mlm/qa/clas
REFORMER_TINY = "hf-internal-testing/tiny-random-reformer"  # multiple issues with either mlm/qa/clas

# *** Lacking official examples to test with ***
# or not working with examples
DPR_TINY = "hf-internal-testing/tiny-random-dpr"
# - "dpr" examples/research_projects/rag-end2end-retriever/
RAG_TINY = "hf-internal-testing/tiny-random-rag"
# - "rag" research_projects
LUKE_TINY = ""
# - "luke" Entities classes - no plan to make such example
LXMERT_TINY = "hf-internal-testing/tiny-random-lxmert"
# - "lxmert" doesn't work with run_qa.py
CLIP_TINY = "hf-internal-testing/tiny-random-clip"
# - "clip" nothing under pytorch examples - XXX: Suraj is working on adding some - check by end of Sep
SPEECH_TO_TEXT_TINY = "hf-internal-testing/tiny-random-speech_to_text"
# - "speech_to_text", nothing under pytorch examples


# *** Reactive mode ***
# models with low usage, unstable API, things about to change - do nothing about the following until someone runs into a problem
TAPAS_TINY = "hf-internal-testing/tiny-random-tapas"
# additional notes on tapas
# 1. "Table must be of type pd.DataFrame" failure


# TODO: new models to add:
#


def get_launcher(distributed=False):
    # 1. explicitly set --num_nodes=1 just in case these tests end up run on a multi-node setup
    #    - it won't be able to handle that
    # 2. for now testing with just 2 gpus max (since some quality tests may give different
    #    results with more gpus because we use very little data)
    num_gpus = min(2, backend_device_count(torch_device)) if distributed else 1
    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
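To make the helper above concrete, the launcher prefix it returns would look roughly like this (illustrative, for a 2-GPU host with DS_TEST_PORT unset):

# get_launcher(distributed=True)  ->
#   ["deepspeed", "--num_nodes", "1", "--num_gpus", "2", "--master_port", "10999"]
# get_launcher() without distributed mode pins --num_gpus to 1.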


def make_task_cmds():
    data_dir_samples = f"{FIXTURE_DIRECTORY}/tests_samples"
    data_dir_wmt = f"{data_dir_samples}/wmt_en_ro"
    data_dir_xsum = f"{data_dir_samples}/xsum"
    args_main = """
        --do_train
        --max_train_samples 4
        --per_device_train_batch_size 2
        --num_train_epochs 1
        --fp16
        --report_to none
        --overwrite_output_dir
        """.split()

    # try to cover as many models as possible once (it's enough to run on one task per model)
    # but need a tiny model for each
    #
    # should have "{model_type.upper()}_TINY" corresponding vars defined, e.g., T5_TINY, etc.
    tasks2models = {
        "trans": [
            "bart",
            "fsmt",
            "m2m_100",
            "marian",
            "mbart",
            "t5",
            "t5_v1",
            # "mt5", missing model files
        ],
        "sum": [
            "pegasus",
        ],
        "clm": [
            "big_bird",
            "bigbird_pegasus",
            "blenderbot",
            "bloom",
            "gpt2",
            "gpt_neo",
            "gptj",
            "xlm-roberta",
            "prophetnet",
            # "camembert", missing model files
        ],
        "mlm": [
            "albert",
            "deberta",
            "deberta-v2",
            "distilbert",
            "electra",
            "flaubert",
            "funnel",
            "layoutlm",
            # "reformer", # multiple issues with either mlm/qa/clas
        ],
        "qa": [
            "led",
            "longformer",
            "mobilebert",
            "mpnet",
            "roberta",
            "squeezebert",
            # "convbert", # missing tokenizer files
            # "layoutlmv2", missing model files
        ],
        "clas": [
            "bert",
            "xlnet",
            # "hubert", # missing tokenizer files
            # "ibert", # multiple issues with either mlm/qa/clas
            # "transfo-xl", # tokenizer issues as Salesforce/ctrl
            # "Salesforce/ctrl", # tokenizer issues
            # "openai-community/openai-gpt", missing model files
            # "tapas", multiple issues
        ],
        "img_clas": [
            "vit",
        ],
    }

    scripts_dir = f"{ROOT_DIRECTORY}/examples/pytorch"

    tasks = {
        "trans": f"""
        {scripts_dir}/translation/run_translation.py
        --train_file {data_dir_wmt}/train.json
        --source_lang en
        --target_lang ro
        --max_source_length 12
        --max_target_length 12
        """,
        "sum": f"""
        {scripts_dir}/summarization/run_summarization.py
        --train_file {data_dir_xsum}/sample.json
        --max_source_length 12
        --max_target_length 12
        --lang en
        """,
        "clm": f"""
        {scripts_dir}/language-modeling/run_clm.py
        --train_file {FIXTURE_DIRECTORY}/sample_text.txt
        --block_size 8
        """,
        "mlm": f"""
        {scripts_dir}/language-modeling/run_mlm.py
        --train_file {FIXTURE_DIRECTORY}/sample_text.txt
        """,
        "qa": f"""
        {scripts_dir}/question-answering/run_qa.py
        --train_file {data_dir_samples}/SQUAD/sample.json
        """,
        "clas": f"""
        {scripts_dir}/text-classification/run_glue.py
        --train_file {data_dir_samples}/MRPC/train.csv
        --max_seq_length 12
        --task_name MRPC
        """,
        "img_clas": f"""
        {scripts_dir}/image-classification/run_image_classification.py
        --dataset_name hf-internal-testing/cats_vs_dogs_sample
        --remove_unused_columns False
        --max_steps 10
        --image_processor_name {DS_TESTS_DIRECTORY}/vit_feature_extractor.json
        --label_column_name labels
        """,
    }

    launcher = get_launcher(distributed=True)

    cmds = {}
    for task, args in tasks.items():
        args = args.split()
        for model in tasks2models[task]:
            model_name = globals()[f"{model.upper().replace('-', '_')}_TINY"]
            args_model = f"--model_name_or_path {model_name}".split()
            cmds[f"{task}_{model}"] = launcher + args + args_model + args_main

            # # generation special case
            # if task == "gen":
            #     launcher = f"deepspeed --num_nodes 1 --num_gpus 1".split()
            #     args_model += f"--model_type {model}".split()
            #     cmds[f"{task}_{model}"] = launcher + args + args_model
            # else:

    return cmds


task_cmds = make_task_cmds()
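The resulting dict maps "{task}_{model}" keys to complete argv lists: launcher prefix, then the task script and its args, then the per-model --model_name_or_path, then the shared args_main flags. An illustrative shape of one entry (paths depend on the local checkout):

# task_cmds["trans_t5"] would look roughly like:
#   ["deepspeed", "--num_nodes", "1", "--num_gpus", "2", "--master_port", "10999",
#    ".../examples/pytorch/translation/run_translation.py",
#    "--train_file", ".../wmt_en_ro/train.json", "--source_lang", "en", "--target_lang", "ro",
#    "--max_source_length", "12", "--max_target_length", "12",
#    "--model_name_or_path", "patrickvonplaten/t5-tiny-random",
#    "--do_train", "--max_train_samples", "4", "--per_device_train_batch_size", "2",
#    "--num_train_epochs", "1", "--fp16", "--report_to", "none", "--overwrite_output_dir"]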

ZERO2 = "zero2"
ZERO3 = "zero3"

stages = [ZERO2, ZERO3]

# future preparation:
# for now test just fp16, as these tests are quite slow
# FP16 = "fp16"
# BF16 = "bf16"
#
# dtypes = [FP16]
# so just hardcoding --fp16 for now
# if is_torch_bf16_gpu_available():
#     dtypes += [BF16]


def parameterized_custom_name_func(func, param_num, param):
    # customize the test name generator function as we want both params to appear in the sub-test
    # name, as by default it shows only the first param
    param_based_name = parameterized.to_safe_name("_".join(str(x) for x in param.args))
    return f"{func.__name__}_{param_based_name}"
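With this name function, every (stage, task_model) pair produces a self-describing sub-test name, for example:

# the params entry ("zero3", "trans_t5") applied to test_zero_to_fp32 yields:
#   test_zero_to_fp32_zero3_trans_t5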

# Cartesian-product of zero stages with models to test
params = list(itertools.product(stages, task_cmds.keys()))


@slow
@run_first
@require_deepspeed
@require_torch_accelerator
class TestDeepSpeedModelZoo(TestCasePlus):
    """This class is for testing via an external script - can do multiple gpus"""

    def get_task_cmd(self, task, stage):
        # return a ready to run train cmd
        if task not in task_cmds:
            raise ValueError(f"don't know of task {task}, have {task_cmds.keys()}")

        cmd = task_cmds[task]
        args_ds = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()

        output_dir = self.get_auto_remove_tmp_dir()
        args_out = f"--output_dir {output_dir}".split()

        cmd += args_ds + args_out

        return cmd, output_dir

    @parameterized.expand(params, name_func=parameterized_custom_name_func)
    def test_zero_to_fp32(self, stage, task):
        # testing the ability to do a run followed by recovery of full fp32 weights

        cmd, output_dir = self.get_task_cmd(task, stage)

        # 1. generate the checkpoint
        cmd += "--save_steps 1".split()
        # keep for quick debug
        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] + cmd)); die
        execute_subprocess_async(cmd, env=self.get_env())

        # 2. test that the fp32 weights get reconsolidated
        chkpt_dir = f"{output_dir}/checkpoint-1"
        recovered_model_path = f"{chkpt_dir}/out.bin"
        cmd = f"{chkpt_dir}/zero_to_fp32.py {chkpt_dir} {recovered_model_path}"
        # keep for quick debug
        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] + cmd)); die
        subprocess.check_call(cmd, shell=True)
        assert os.path.exists(recovered_model_path), f"{recovered_model_path} was not found"

        # possibly could also test that the resulting saved model is usable but given that we use
        # random models we won't know if it's any good
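Since the whole class is gated behind @slow, these tests are skipped by default. A plausible manual invocation for a single stage/model combination, assuming the usual Hugging Face RUN_SLOW opt-in and a pytest -k filter (sketch only):

# RUN_SLOW=1 pytest tests/deepspeed/test_model_zoo.py -k "zero3_trans_t5"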
4  transformers/tests/deepspeed/vit_feature_extractor.json  Normal file
@@ -0,0 +1,4 @@
{
    "feature_extractor_type": "ViTFeatureExtractor",
    "size": 30
}
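This tiny image-processor config is what the img_clas command in make_task_cmds points at via --image_processor_name. A minimal sketch of loading it directly, assuming from_pretrained accepts a path to a saved feature-extractor JSON file:

# Sketch only: resolves to a ViT feature extractor with 30x30 inputs.
from transformers import ViTFeatureExtractor

fe = ViTFeatureExtractor.from_pretrained("tests/deepspeed/vit_feature_extractor.json")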