init

2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions
--- a/transformers/examples/legacy/multiple_choice/run_multiple_choice.py
+++ b/transformers/examples/legacy/multiple_choice/run_multiple_choice.py
@@ -0,0 +1,243 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+from utils_multiple_choice import MultipleChoiceDataset, Split, processors
+
+import transformers
+from transformers import (
+    AutoConfig,
+    AutoModelForMultipleChoice,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    EvalPrediction,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    set_seed,
+)
+from transformers.trainer_utils import is_main_process
+
+
+logger = logging.getLogger(__name__)
+
+
+def simple_accuracy(preds, labels):
+    return (preds == labels).mean()
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())})
+    data_dir: str = field(metadata={"help": "Should contain the data files for the task."})
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help": (
+                "The maximum total input sequence length after tokenization. Sequences longer "
+                "than this will be truncated, sequences shorter will be padded."
+            )
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use"
+            " --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_rank,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.local_rank != -1),
+        training_args.fp16,
+    )
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    if is_main_process(training_args.local_rank):
+        transformers.utils.logging.set_verbosity_info()
+        transformers.utils.logging.enable_default_handler()
+        transformers.utils.logging.enable_explicit_format()
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed
+    set_seed(training_args.seed)
+
+    try:
+        processor = processors[data_args.task_name]()
+        label_list = processor.get_labels()
+        num_labels = len(label_list)
+    except KeyError:
+        raise ValueError("Task not found: %s" % (data_args.task_name))
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=data_args.task_name,
+        cache_dir=model_args.cache_dir,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+    )
+    model = AutoModelForMultipleChoice.from_pretrained(
+        model_args.model_name_or_path,
+        from_tf=bool(".ckpt" in model_args.model_name_or_path),
+        config=config,
+        cache_dir=model_args.cache_dir,
+    )
+
+    # Get datasets
+    train_dataset = (
+        MultipleChoiceDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+            mode=Split.train,
+        )
+        if training_args.do_train
+        else None
+    )
+    eval_dataset = (
+        MultipleChoiceDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+            mode=Split.dev,
+        )
+        if training_args.do_eval
+        else None
+    )
+
+    def compute_metrics(p: EvalPrediction) -> dict:
+        preds = np.argmax(p.predictions, axis=1)
+        return {"acc": simple_accuracy(preds, p.label_ids)}
+
+    # Data collator
+    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None
+
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        compute_metrics=compute_metrics,
+        data_collator=data_collator,
+    )
+
+    # Training
+    if training_args.do_train:
+        trainer.train(
+            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
+        )
+        trainer.save_model()
+        # For convenience, we also re-save the tokenizer to the same directory,
+        # so that you can share your model easily on huggingface.co/models =)
+        if trainer.is_world_master():
+            tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation
+    results = {}
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        result = trainer.evaluate()
+
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
+        if trainer.is_world_master():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in result.items():
+                    logger.info("  %s = %s", key, value)
+                    writer.write("{} = {}\n".format(key, value))
+
+                results.update(result)
+
+    return results
+
+
+def _mp_fn(index):
+    # For xla_spawn (TPUs)
+    main()
+
+
+if __name__ == "__main__":
+    main()
--- a/transformers/examples/legacy/multiple_choice/utils_multiple_choice.py
+++ b/transformers/examples/legacy/multiple_choice/utils_multiple_choice.py
@@ -0,0 +1,483 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension"""
+
+import csv
+import glob
+import json
+import logging
+import os
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional
+
+import tqdm
+from filelock import FileLock
+
+from transformers import PreTrainedTokenizer, is_torch_available
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class InputExample:
+    """
+    A single training/test example for multiple choice
+
+    Args:
+        example_id: Unique id for the example.
+        question: string. The untokenized text of the second sequence (question).
+        contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
+        endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
+        label: (Optional) string. The label of the example. This should be
+        specified for train and dev examples, but not for test examples.
+    """
+
+    example_id: str
+    question: str
+    contexts: list[str]
+    endings: list[str]
+    label: Optional[str]
+
+
+@dataclass(frozen=True)
+class InputFeatures:
+    """
+    A single set of features of data.
+    Property names are the same names as the corresponding inputs to a model.
+    """
+
+    example_id: str
+    input_ids: list[list[int]]
+    attention_mask: Optional[list[list[int]]]
+    token_type_ids: Optional[list[list[int]]]
+    label: Optional[int]
+
+
+class Split(Enum):
+    train = "train"
+    dev = "dev"
+    test = "test"
+
+
+if is_torch_available():
+    import torch
+    from torch.utils.data import Dataset
+
+    class MultipleChoiceDataset(Dataset):
+        features: list[InputFeatures]
+
+        def __init__(
+            self,
+            data_dir: str,
+            tokenizer: PreTrainedTokenizer,
+            task: str,
+            max_seq_length: Optional[int] = None,
+            overwrite_cache=False,
+            mode: Split = Split.train,
+        ):
+            processor = processors[task]()
+
+            cached_features_file = os.path.join(
+                data_dir,
+                "cached_{}_{}_{}_{}".format(
+                    mode.value,
+                    tokenizer.__class__.__name__,
+                    str(max_seq_length),
+                    task,
+                ),
+            )
+
+            # Make sure only the first process in distributed training processes the dataset,
+            # and the others will use the cache.
+            lock_path = cached_features_file + ".lock"
+            with FileLock(lock_path):
+                if os.path.exists(cached_features_file) and not overwrite_cache:
+                    logger.info(f"Loading features from cached file {cached_features_file}")
+                    self.features = torch.load(cached_features_file, weights_only=True)
+                else:
+                    logger.info(f"Creating features from dataset file at {data_dir}")
+                    label_list = processor.get_labels()
+                    if mode == Split.dev:
+                        examples = processor.get_dev_examples(data_dir)
+                    elif mode == Split.test:
+                        examples = processor.get_test_examples(data_dir)
+                    else:
+                        examples = processor.get_train_examples(data_dir)
+                    logger.info("Training examples: %s", len(examples))
+                    self.features = convert_examples_to_features(
+                        examples,
+                        label_list,
+                        max_seq_length,
+                        tokenizer,
+                    )
+                    logger.info("Saving features into cached file %s", cached_features_file)
+                    torch.save(self.features, cached_features_file)
+
+        def __len__(self):
+            return len(self.features)
+
+        def __getitem__(self, i) -> InputFeatures:
+            return self.features[i]
+
+
+class DataProcessor:
+    """Base class for data converters for multiple choice data sets."""
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+
+    def get_test_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the test set."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+
+class RaceProcessor(DataProcessor):
+    """Processor for the RACE data set."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} train")
+        high = os.path.join(data_dir, "train/high")
+        middle = os.path.join(data_dir, "train/middle")
+        high = self._read_txt(high)
+        middle = self._read_txt(middle)
+        return self._create_examples(high + middle, "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} dev")
+        high = os.path.join(data_dir, "dev/high")
+        middle = os.path.join(data_dir, "dev/middle")
+        high = self._read_txt(high)
+        middle = self._read_txt(middle)
+        return self._create_examples(high + middle, "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} test")
+        high = os.path.join(data_dir, "test/high")
+        middle = os.path.join(data_dir, "test/middle")
+        high = self._read_txt(high)
+        middle = self._read_txt(middle)
+        return self._create_examples(high + middle, "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1", "2", "3"]
+
+    def _read_txt(self, input_dir):
+        lines = []
+        files = glob.glob(input_dir + "/*txt")
+        for file in tqdm.tqdm(files, desc="read files"):
+            with open(file, encoding="utf-8") as fin:
+                data_raw = json.load(fin)
+                data_raw["race_id"] = file
+                lines.append(data_raw)
+        return lines
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for _, data_raw in enumerate(lines):
+            race_id = "{}-{}".format(set_type, data_raw["race_id"])
+            article = data_raw["article"]
+            for i in range(len(data_raw["answers"])):
+                truth = str(ord(data_raw["answers"][i]) - ord("A"))
+                question = data_raw["questions"][i]
+                options = data_raw["options"][i]
+
+                examples.append(
+                    InputExample(
+                        example_id=race_id,
+                        question=question,
+                        contexts=[article, article, article, article],  # this is not efficient but convenient
+                        endings=[options[0], options[1], options[2], options[3]],
+                        label=truth,
+                    )
+                )
+        return examples
+
+
+class SynonymProcessor(DataProcessor):
+    """Processor for the Synonym data set."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} train")
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} dev")
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} dev")
+
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1", "2", "3", "4"]
+
+    def _read_csv(self, input_file):
+        with open(input_file, encoding="utf-8") as f:
+            return list(csv.reader(f))
+
+    def _create_examples(self, lines: list[list[str]], type: str):
+        """Creates examples for the training and dev sets."""
+
+        examples = [
+            InputExample(
+                example_id=line[0],
+                question="",  # in the swag dataset, the
+                # common beginning of each
+                # choice is stored in "sent2".
+                contexts=[line[1], line[1], line[1], line[1], line[1]],
+                endings=[line[2], line[3], line[4], line[5], line[6]],
+                label=line[7],
+            )
+            for line in lines  # we skip the line with the column names
+        ]
+
+        return examples
+
+
+class SwagProcessor(DataProcessor):
+    """Processor for the SWAG data set."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} train")
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} dev")
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} dev")
+        raise ValueError(
+            "For swag testing, the input file does not contain a label column. It can not be tested in current code "
+            "setting!"
+        )
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1", "2", "3"]
+
+    def _read_csv(self, input_file):
+        with open(input_file, encoding="utf-8") as f:
+            return list(csv.reader(f))
+
+    def _create_examples(self, lines: list[list[str]], type: str):
+        """Creates examples for the training and dev sets."""
+        if type == "train" and lines[0][-1] != "label":
+            raise ValueError("For training, the input file must contain a label column.")
+
+        examples = [
+            InputExample(
+                example_id=line[2],
+                question=line[5],  # in the swag dataset, the
+                # common beginning of each
+                # choice is stored in "sent2".
+                contexts=[line[4], line[4], line[4], line[4]],
+                endings=[line[7], line[8], line[9], line[10]],
+                label=line[11],
+            )
+            for line in lines[1:]  # we skip the line with the column names
+        ]
+
+        return examples
+
+
+class ArcProcessor(DataProcessor):
+    """Processor for the ARC data set (request from allennlp)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} train")
+        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} dev")
+        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")
+
+    def get_test_examples(self, data_dir):
+        logger.info(f"LOOKING AT {data_dir} test")
+        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1", "2", "3"]
+
+    def _read_json(self, input_file):
+        with open(input_file, encoding="utf-8") as fin:
+            lines = fin.readlines()
+            return lines
+
+    def _create_examples(self, lines, type):
+        """Creates examples for the training and dev sets."""
+
+        # There are two types of labels. They should be normalized
+        def normalize(truth):
+            if truth in "ABCD":
+                return ord(truth) - ord("A")
+            elif truth in "1234":
+                return int(truth) - 1
+            else:
+                logger.info("truth ERROR! %s", str(truth))
+                return None
+
+        examples = []
+        three_choice = 0
+        four_choice = 0
+        five_choice = 0
+        other_choices = 0
+        # we deleted example which has more than or less than four choices
+        for line in tqdm.tqdm(lines, desc="read arc data"):
+            data_raw = json.loads(line.strip("\n"))
+            if len(data_raw["question"]["choices"]) == 3:
+                three_choice += 1
+                continue
+            elif len(data_raw["question"]["choices"]) == 5:
+                five_choice += 1
+                continue
+            elif len(data_raw["question"]["choices"]) != 4:
+                other_choices += 1
+                continue
+            four_choice += 1
+            truth = str(normalize(data_raw["answerKey"]))
+            assert truth != "None"
+            question_choices = data_raw["question"]
+            question = question_choices["stem"]
+            id = data_raw["id"]
+            options = question_choices["choices"]
+            if len(options) == 4:
+                examples.append(
+                    InputExample(
+                        example_id=id,
+                        question=question,
+                        contexts=[
+                            options[0]["para"].replace("_", ""),
+                            options[1]["para"].replace("_", ""),
+                            options[2]["para"].replace("_", ""),
+                            options[3]["para"].replace("_", ""),
+                        ],
+                        endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
+                        label=truth,
+                    )
+                )
+
+        if type == "train":
+            assert len(examples) > 1
+            assert examples[0].label is not None
+        logger.info("len examples: %s}", str(len(examples)))
+        logger.info("Three choices: %s", str(three_choice))
+        logger.info("Five choices: %s", str(five_choice))
+        logger.info("Other choices: %s", str(other_choices))
+        logger.info("four choices: %s", str(four_choice))
+
+        return examples
+
+
+def convert_examples_to_features(
+    examples: list[InputExample],
+    label_list: list[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
+) -> list[InputFeatures]:
+    """
+    Loads a data file into a list of `InputFeatures`
+    """
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    for ex_index, example in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+        choices_inputs = []
+        for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
+            text_a = context
+            if example.question.find("_") != -1:
+                # this is for cloze question
+                text_b = example.question.replace("_", ending)
+            else:
+                text_b = example.question + " " + ending
+
+            inputs = tokenizer(
+                text_a,
+                text_b,
+                add_special_tokens=True,
+                max_length=max_length,
+                padding="max_length",
+                truncation=True,
+                return_overflowing_tokens=True,
+            )
+            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
+                logger.info(
+                    "Attention! you are cropping tokens (swag task is ok). "
+                    "If you are training ARC and RACE and you are popping question + options, "
+                    "you need to try to use a bigger max seq length!"
+                )
+
+            choices_inputs.append(inputs)
+
+        label = label_map[example.label]
+
+        input_ids = [x["input_ids"] for x in choices_inputs]
+        attention_mask = (
+            [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None
+        )
+        token_type_ids = (
+            [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None
+        )
+
+        features.append(
+            InputFeatures(
+                example_id=example.example_id,
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+                label=label,
+            )
+        )
+
+    for f in features[:2]:
+        logger.info("*** Example ***")
+        logger.info("feature: %s" % f)
+
+    return features
+
+
+processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor, "syn": SynonymProcessor}
+MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4, "syn", 5}