enginex-mlu370-any2any/transformers/tests/utils/test_auto_docstring.py

# coding=utf-8
# Copyright 2024 The HuggingFace Team Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a clone of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest


LLAMA_CLM_FORWARD = """        The [`LlamaForCausalLM`] forward method, overrides the `__call__` special method.\n\n        <Tip>\n\n        Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n        instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n        the latter silently ignores them.\n\n        </Tip>\n\n        Args:\n            input_ids (`Optional[torch.LongTensor]`)of shape `(batch_size, sequence_length)`):\n                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n                [`PreTrainedTokenizer.__call__`] for details.\n\n                [What are input IDs?](../glossary#input-ids)\n            attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n                - 1 for tokens that are **not masked**,\n                - 0 for tokens that are **masked**.\n\n                [What are attention masks?](../glossary#attention-mask)\n            position_ids (`Optional[torch.LongTensor]`):\n                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n                [What are position IDs?](../glossary#position-ids)\n            past_key_values (`Optional[~cache_utils.Cache]`):\n                Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n                blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n                returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n                Two formats are allowed:\n                    - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n                    - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n                    shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n                    cache format.\n\n                The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n                legacy cache format will be returned.\n\n                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don\'t\n                have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n                of shape `(batch_size, sequence_length)`.\n            inputs_embeds (`Optional[torch.FloatTensor]`):\n                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n                is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n                model\'s internal embedding lookup matrix.\n            labels (`Optional[torch.LongTensor]`) of shape `(batch_size, sequence_length)`:\n                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,\n                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored\n                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.\n            use_cache (`Optional[bool]`):\n                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n                `past_key_values`).\n            output_attentions (`Optional[bool]`):\n                Whether or not to return the attentions tensors of all attention layers. S

LLAMA_MODEL_DOCSTRING = """        The [`LlamaModel`] forward method, overrides the `__call__` special method.\n\n        <Tip>\n\n        Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n        instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n        the latter silently ignores them.\n\n        </Tip>\n\n        Args:\n            input_ids (`Optional[torch.LongTensor]`)of shape `(batch_size, sequence_length)`):\n                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n                [`PreTrainedTokenizer.__call__`] for details.\n\n                [What are input IDs?](../glossary#input-ids)\n            attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n                - 1 for tokens that are **not masked**,\n                - 0 for tokens that are **masked**.\n\n                [What are attention masks?](../glossary#attention-mask)\n            position_ids (`Optional[torch.LongTensor]`):\n                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n                [What are position IDs?](../glossary#position-ids)\n            past_key_values (`Optional[~cache_utils.Cache]`):\n                Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n                blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n                returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n                Two formats are allowed:\n                    - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n                    - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n                    shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n                    cache format.\n\n                The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n                legacy cache format will be returned.\n\n                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't\n                have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n                of shape `(batch_size, sequence_length)`.\n            inputs_embeds (`Optional[torch.FloatTensor]`):\n                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n                is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n                model's internal embedding lookup matrix.\n            use_cache (`Optional[bool]`):\n                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n                `past_key_values`).\n            output_attentions (`Optional[bool]`):\n                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n                tensors for more detail.\n            output_hidden_states (`Optional[bool]`):\n                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for\n                more detail.\n            cache_position (`Optional[torch.LongTensor]`):\n                Indices depicting the position of the input sequence tokens in the sequence. Contraril

LLAMA_DECODER = """        The [`LlamaDecoderLayer`] forward method, overrides the `__call__` special method.\n\n        <Tip>\n\n        Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n        instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n        the latter silently ignores them.\n\n        </Tip>\n\n        Args:\n            hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)            attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n                - 1 for tokens that are **not masked**,\n                - 0 for tokens that are **masked**.\n\n                [What are attention masks?](../glossary#attention-mask)\n            position_ids (`Optional[torch.LongTensor]`):\n                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n                [What are position IDs?](../glossary#position-ids)\n            past_key_values (`Optional[~cache_utils.Cache]`):deprecated in favor of `past_key_values`            output_attentions (`Optional[bool]`, defaults to `False`):\n                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n                tensors for more detail.\n            use_cache (`Optional[bool]`, defaults to `False`):\n                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n                `past_key_values`).\n            cache_position (`Optional[torch.LongTensor]`):\n                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,\n                this tensor is not affected by padding. It is used to update the cache in the correct position and to infer\n                the complete sequence length.\n            position_embeddings (`Optional[Tuple[torch.Tensor, torch.Tensor]]`):\n                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,\n                with `head_dim` being the embedding dimension of each attention head.\n\n        Returns:\n            `Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]`"""

LLAMA_FOR_SEQUENCE_CLASSIFICATION_DOC = """        The [`LlamaForSequenceClassification`] forward method, overrides the `__call__` special method.\n\n        <Tip>\n\n        Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n        instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n        the latter silently ignores them.\n\n        </Tip>\n\n        Args:\n            input_ids (`Optional[torch.LongTensor]`)of shape `(batch_size, sequence_length)`):\n                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n                [`PreTrainedTokenizer.__call__`] for details.\n\n                [What are input IDs?](../glossary#input-ids)\n            attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n                - 1 for tokens that are **not masked**,\n                - 0 for tokens that are **masked**.\n\n                [What are attention masks?](../glossary#attention-mask)\n            position_ids (`Optional[torch.LongTensor]`):\n                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n                [What are position IDs?](../glossary#position-ids)\n            past_key_values (`Optional[~cache_utils.Cache]`):\n                Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n                blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n                returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n                Two formats are allowed:\n                    - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n                    - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n                    shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n                    cache format.\n\n                The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n                legacy cache format will be returned.\n\n                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don\'t\n                have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n                of shape `(batch_size, sequence_length)`.\n            inputs_embeds (`Optional[torch.FloatTensor]`):\n                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n                is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n                model\'s internal embedding lookup matrix.\n            labels (`Optional[torch.LongTensor]`):\n                Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,\n                config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If\n                `config.num_labels > 1` a classification loss is computed (Cross-Entropy).\n            use_cache (`Optional[bool]`):\n                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n                `past_key_values`).\n            output_attentions (`Optional[bool]`):\n                Whether or not to return the attentions tensors of all attention layers. See `attentions` under r

GEMMA3_IMAGE_PROCESSOR_FAST_DOCSTRING = """\nConstructs a fast Gemma3 image processor.\n\nParameters:\n    do_resize (`Optional[bool]`, defaults to `True`):\n        Whether to resize the image.\n    size (`Optional[dict[str, int]]`, defaults to `{\'height\': 224, \'width\': 224}`):\n        Describes the maximum input dimensions to the model.\n    default_to_square (`Optional[bool]`, defaults to `True`):\n        Whether to default to a square image when resizing, if size is an int.\n    resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`, defaults to `2`):\n        Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n        has an effect if `do_resize` is set to `True`.\n    do_center_crop (`Optional[bool]`, defaults to `None`):\n        Whether to center crop the image.\n    crop_size (`Optional[dict[str, int]]`, defaults to `None`):\n        Size of the output image after applying `center_crop`.\n    do_rescale (`Optional[bool]`, defaults to `True`):\n        Whether to rescale the image.\n    rescale_factor (`Union[int, float, NoneType]`, defaults to `0.00392156862745098`):\n        Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n    do_normalize (`Optional[bool]`, defaults to `True`):\n        Whether to normalize the image.\n    image_mean (`Union[float, list[float], NoneType]`, defaults to `[0.5, 0.5, 0.5]`):\n        Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n    image_std (`Union[float, list[float], NoneType]`, defaults to `[0.5, 0.5, 0.5]`):\n        Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n        `True`.\n    do_convert_rgb (`Optional[bool]`, defaults to `None`):\n        Whether to convert the image to RGB.\n    return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`, defaults to `None`):\n        Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n    data_format (`Optional[~image_utils.ChannelDimension]`, defaults to `ChannelDimension.FIRST`):\n        Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n    input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`, defaults to `None`):\n        The channel dimension format for the input image. If unset, the channel dimension format is inferred\n        from the input image. Can be one of:\n        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.\n    device (`Optional[torch.device]`, defaults to `None`):\n        The device to process the images on. If unset, the device is inferred from the input images.\n    do_pan_and_scan (`Optional[bool]`, defaults to `None`):\n        Whether to apply `pan_and_scan` to images.\n    pan_and_scan_min_crop_size (`Optional[int]`, defaults to `None`):\n        Minimum size of each crop in pan and scan.\n    pan_and_scan_max_num_crops (`Optional[int]`, defaults to `None`):\n        Maximum number of crops per image in pan and scan.\n    pan_and_scan_min_ratio_to_activate (`Optional[float]`, defaults to `None`):\n        Minimum aspect ratio to activate pan and scan.\n"""

GEMMA3_IMAGE_PROCESSOR_FAST_PREPROCESS_DOCSTRING = """        Args:\n            images (`Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, list[\'PIL.Image.Image\'], list[numpy.ndarray], list[\'torch.Tensor\']]`):\n                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If\n                passing in images with pixel values between 0 and 1, set `do_rescale=False`.\n            do_resize (`Optional[bool]`):\n                Whether to resize the image.\n            size (`Optional[dict[str, int]]`):\n                Describes the maximum input dimensions to the model.\n            default_to_square (`Optional[bool]`):\n                Whether to default to a square image when resizing, if size is an int.\n            resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`):\n                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n                has an effect if `do_resize` is set to `True`.\n            do_center_crop (`Optional[bool]`):\n                Whether to center crop the image.\n            crop_size (`Optional[dict[str, int]]`):\n                Size of the output image after applying `center_crop`.\n            do_rescale (`Optional[bool]`):\n                Whether to rescale the image.\n            rescale_factor (`Union[int, float, NoneType]`):\n                Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n            do_normalize (`Optional[bool]`):\n                Whether to normalize the image.\n            image_mean (`Union[float, list[float], NoneType]`):\n                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n            image_std (`Union[float, list[float], NoneType]`):\n                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n                `True`.\n            do_convert_rgb (`Optional[bool]`):\n                Whether to convert the image to RGB.\n            return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`):\n                Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n            data_format (`Optional[~image_utils.ChannelDimension]`):\n                Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n            input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`):\n                The channel dimension format for the input image. If unset, the channel dimension format is inferred\n                from the input image. Can be one of:\n                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.\n            device (`Optional[torch.device]`):\n                The device to process the images on. If unset, the device is inferred from the input images.\n            do_pan_and_scan (`Optional[bool]`):\n                Whether to apply `pan_and_scan` to images.\n            pan_and_scan_min_crop_size (`Optional[int]`):\n                Minimum size of each crop in pan and scan.\n            pan_and_scan_max_num_crops (`Optional[int]`):\n                Maximum number of crops per image in pan and scan.\n            pan_and_scan_min_ratio_to_activate (`Optional[float]`):\n                Minimum aspect ratio to activate pan and scan.\n\n        Returns:\n            `<class \'transformers.image_processing_base.BatchFeature\'>`:\n                - **data** (`dict`) -- Dictionary of lists/arrays/tensors returned by the __call__ method (\'pixel_values\', etc.).\n                - **tensor_type** (`Union[None, str, TensorType]`, *optional*) -- You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at\n                  initialization.


class AutoDocstringTest(unittest.TestCase):
    pass
    # def test_modeling_docstring(self):
    #     llama_docstring = "        Args:\n            images (`Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, list['PIL.Image.Image'], list[numpy.ndarray], list['torch.Tensor']]`):\n                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If\n                passing in images with pixel values between 0 and 1, set `do_rescale=False`.\n            do_resize (`Optional[bool]`):\n                Whether to resize the image.\n            size (`Optional[dict[str, int]]`):\n                Describes the maximum input dimensions to the model.\n            default_to_square (`Optional[bool]`):\n                Whether to default to a square image when resizing, if size is an int.\n            resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`):\n                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n                has an effect if `do_resize` is set to `True`.\n            do_center_crop (`Optional[bool]`):\n                Whether to center crop the image.\n            crop_size (`Optional[dict[str, int]]`):\n                Size of the output image after applying `center_crop`.\n            do_rescale (`Optional[bool]`):\n                Whether to rescale the image.\n            rescale_factor (`Union[int, float, NoneType]`):\n                Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n            do_normalize (`Optional[bool]`):\n                Whether to normalize the image.\n            image_mean (`Union[float, list[float], NoneType]`):\n                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n            image_std (`Union[float, list[float], NoneType]`):\n                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n                `True`.\n            do_convert_rgb (`Optional[bool]`):\n                Whether to convert the image to RGB.\n            return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`):\n                Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n            data_format (`Optional[~image_utils.ChannelDimension]`):\n                Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n            input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`):\n                The channel dimension format for the input image. If unset, the channel dimension format is inferred\n                from the input image. Can be one of:\n                - `\"channels_first\"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n                - `\"channels_last\"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n                - `\"none\"` or `ChannelDimension.NONE`: image in (height, width) format.\n            device (`Optional[torch.device]`):\n                The device to process the images on. If unset, the device is inferred from the input images.\n            do_pan_and_scan (`Optional[bool]`):\n                Whether to apply `pan_and_scan` to images.\n            pan_and_scan_min_crop_size (`Optional[int]`):\n                Minimum size of each crop in pan and scan.\n            pan_and_scan_max_num_crops (`Optional[int]`):\n                Maximum number of crops per image in pan and scan.\n            pan_and_scan_min_ratio_to_activate (`Optional[float]`):\n                Minimum aspect ratio to activate pan and scan.\n\n        Returns:\n            `<class 'transformers.image_processing_base.BatchFeature'>`:\n                - **data** (`dict`) -- Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.).\n                - **tensor_type** (`Union[None, str, TensorType]`, *optional*) -- You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at\n                  initialization.\n"
    #     self.assertEqual(llama_docstring, LlamaModel.__doc__)

    #     self.assertEqual(LLAMA_MODEL_DOCSTRING, LlamaModel.forward.__doc__)
    #     self.assertEqual(LLAMA_CLM_FORWARD, LlamaForCausalLM.forward.__doc__)
    #     self.assertEqual(LLAMA_DECODER, LlamaDecoderLayer.forward.__doc__)
    #     self.assertEqual(LLAMA_FOR_SEQUENCE_CLASSIFICATION_DOC, LlamaForSequenceClassification.forward.__doc__)

    # def test_fast_image_processor_docstring(self):
    #     self.assertEqual(GEMMA3_IMAGE_PROCESSOR_FAST_DOCSTRING, Gemma3ImageProcessorFast.__doc__)
    #     self.assertEqual(GEMMA3_IMAGE_PROCESSOR_FAST_PREPROCESS_DOCSTRING, Gemma3ImageProcessorFast.preprocess.__doc__)

    # def test_auto_doc(self):
    #     COOL_CLASS_DOC = """
    #     Args:
    #         input_ids (some):
    #         flash_attn_kwargs (FlashAttentionKwrargs):
    #             parameters that are completely optional and that should be passed.
    #         another_warg (something): should pass
    #         and_another_on (this time):
    #             I want
    #             this to be
    #             quite long

    #     Example

    #     ```python
    #     >>> import
    #     ```
    #     """

    #     @auto_docstring
    #     class MyModel:
    #         @auto_docstring
    #         def __init__(input_ids, flash_attn_kwargs=None, another_warg=True, and_another_on=1):
    #             r"""
    #             Args:
    #                 flash_attn_kwargs (FlashAttentionKwrargs):
    #                     parameters that are completely optional and that should be passed.
    #                 another_warg (something): should pass
    #                 and_another_on (this time):
    #                     I want
    #                     this to be
    #                     quite long

    #             Example

    #             ```python
    #             >>> import
    #             ```
    #             """
    #             pass

    #     self.assertEqual(MyModel.__init__.__doc__, COOL_CLASS_DOC)