# coding=utf-8
# Copyright 2024 The HuggingFace Team Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
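
"""Tests for the docstrings generated by the `@auto_docstring` decorator.

The module-level constants below capture the docstrings that `@auto_docstring`
is expected to produce for a few Llama and Gemma3 classes, so a regression in
the generator shows up as a plain string diff in the assertions (currently
kept commented out) at the bottom of the file.
"""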
import unittest
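
# Expected auto-generated docstring for `LlamaForCausalLM.forward`.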
LLAMA_CLM_FORWARD = """ The [`LlamaForCausalLM`] forward method, overrides the `__call__` special method.\n\n <Tip>\n\n Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n the latter silently ignores them.\n\n </Tip>\n\n Args:\n input_ids (`Optional[torch.LongTensor]` of shape `(batch_size, sequence_length)`):\n Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n [`PreTrainedTokenizer.__call__`] for details.\n\n [What are input IDs?](../glossary#input-ids)\n attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n - 1 for tokens that are **not masked**,\n - 0 for tokens that are **masked**.\n\n [What are attention masks?](../glossary#attention-mask)\n position_ids (`Optional[torch.LongTensor]`):\n Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n [What are position IDs?](../glossary#position-ids)\n past_key_values (`Optional[~cache_utils.Cache]`):\n Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n Two formats are allowed:\n - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n cache format.\n\n The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n legacy cache format will be returned.\n\n If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don\'t\n have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n of shape `(batch_size, sequence_length)`.\n inputs_embeds (`Optional[torch.FloatTensor]`):\n Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n model\'s internal embedding lookup matrix.\n labels (`Optional[torch.LongTensor]`) of shape `(batch_size, sequence_length)`:\n Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,\n config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored\n (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.\n use_cache (`Optional[bool]`):\n If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n `past_key_values`).\n output_attentions (`Optional[bool]`):\n Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n tensors for more detail."""
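
# Expected auto-generated docstring for `LlamaModel.forward`.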
LLAMA_MODEL_DOCSTRING = """ The [`LlamaModel`] forward method, overrides the `__call__` special method.\n\n <Tip>\n\n Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n the latter silently ignores them.\n\n </Tip>\n\n Args:\n input_ids (`Optional[torch.LongTensor]` of shape `(batch_size, sequence_length)`):\n Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n [`PreTrainedTokenizer.__call__`] for details.\n\n [What are input IDs?](../glossary#input-ids)\n attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n - 1 for tokens that are **not masked**,\n - 0 for tokens that are **masked**.\n\n [What are attention masks?](../glossary#attention-mask)\n position_ids (`Optional[torch.LongTensor]`):\n Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n [What are position IDs?](../glossary#position-ids)\n past_key_values (`Optional[~cache_utils.Cache]`):\n Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n Two formats are allowed:\n - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n cache format.\n\n The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n legacy cache format will be returned.\n\n If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't\n have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n of shape `(batch_size, sequence_length)`.\n inputs_embeds (`Optional[torch.FloatTensor]`):\n Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n model's internal embedding lookup matrix.\n use_cache (`Optional[bool]`):\n If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n `past_key_values`).\n output_attentions (`Optional[bool]`):\n Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n tensors for more detail.\n output_hidden_states (`Optional[bool]`):\n Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for\n more detail.\n cache_position (`Optional[torch.LongTensor]`):\n Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,\n this tensor is not affected by padding. It is used to update the cache in the correct position and to infer\n the complete sequence length."""
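
# Expected auto-generated docstring for `LlamaDecoderLayer.forward`.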
LLAMA_DECODER = """ The [`LlamaDecoderLayer`] forward method, overrides the `__call__` special method.\n\n <Tip>\n\n Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n the latter silently ignores them.\n\n </Tip>\n\n Args:\n hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim) attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n - 1 for tokens that are **not masked**,\n - 0 for tokens that are **masked**.\n\n [What are attention masks?](../glossary#attention-mask)\n position_ids (`Optional[torch.LongTensor]`):\n Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n [What are position IDs?](../glossary#position-ids)\n past_key_values (`Optional[~cache_utils.Cache]`):deprecated in favor of `past_key_values` output_attentions (`Optional[bool]`, defaults to `False`):\n Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n tensors for more detail.\n use_cache (`Optional[bool]`, defaults to `False`):\n If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n `past_key_values`).\n cache_position (`Optional[torch.LongTensor]`):\n Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,\n this tensor is not affected by padding. It is used to update the cache in the correct position and to infer\n the complete sequence length.\n position_embeddings (`Optional[Tuple[torch.Tensor, torch.Tensor]]`):\n Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,\n with `head_dim` being the embedding dimension of each attention head.\n\n Returns:\n `Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]`"""
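
# Expected auto-generated docstring for `LlamaForSequenceClassification.forward`.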
LLAMA_FOR_SEQUENCE_CLASSIFICATION_DOC = """ The [`LlamaForSequenceClassification`] forward method, overrides the `__call__` special method.\n\n <Tip>\n\n Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]\n instance afterwards instead of this since the former takes care of running the pre and post processing steps while\n the latter silently ignores them.\n\n </Tip>\n\n Args:\n input_ids (`Optional[torch.LongTensor]` of shape `(batch_size, sequence_length)`):\n Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.\n\n Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and\n [`PreTrainedTokenizer.__call__`] for details.\n\n [What are input IDs?](../glossary#input-ids)\n attention_mask (`Optional[torch.Tensor]`) of shape `(batch_size, sequence_length)`:\n Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:\n\n - 1 for tokens that are **not masked**,\n - 0 for tokens that are **masked**.\n\n [What are attention masks?](../glossary#attention-mask)\n position_ids (`Optional[torch.LongTensor]`):\n Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.\n\n [What are position IDs?](../glossary#position-ids)\n past_key_values (`Optional[~cache_utils.Cache]`):\n Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention\n blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`\n returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.\n\n Two formats are allowed:\n - a `~cache_utils.Cache` instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);\n - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of\n shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy\n cache format.\n\n The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the\n legacy cache format will be returned.\n\n If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don\'t\n have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`\n of shape `(batch_size, sequence_length)`.\n inputs_embeds (`Optional[torch.FloatTensor]`):\n Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This\n is useful if you want more control over how to convert `input_ids` indices into associated vectors than the\n model\'s internal embedding lookup matrix.\n labels (`Optional[torch.LongTensor]`):\n Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,\n config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If\n `config.num_labels > 1` a classification loss is computed (Cross-Entropy).\n use_cache (`Optional[bool]`):\n If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n `past_key_values`).\n output_attentions (`Optional[bool]`):\n Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned\n tensors for more detail."""
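
# Expected auto-generated class docstring for `Gemma3ImageProcessorFast`.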
GEMMA3_IMAGE_PROCESSOR_FAST_DOCSTRING = """\nConstructs a fast Gemma3 image processor.\n\nParameters:\n do_resize (`Optional[bool]`, defaults to `True`):\n Whether to resize the image.\n size (`Optional[dict[str, int]]`, defaults to `{\'height\': 224, \'width\': 224}`):\n Describes the maximum input dimensions to the model.\n default_to_square (`Optional[bool]`, defaults to `True`):\n Whether to default to a square image when resizing, if size is an int.\n resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`, defaults to `2`):\n Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n has an effect if `do_resize` is set to `True`.\n do_center_crop (`Optional[bool]`, defaults to `None`):\n Whether to center crop the image.\n crop_size (`Optional[dict[str, int]]`, defaults to `None`):\n Size of the output image after applying `center_crop`.\n do_rescale (`Optional[bool]`, defaults to `True`):\n Whether to rescale the image.\n rescale_factor (`Union[int, float, NoneType]`, defaults to `0.00392156862745098`):\n Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n do_normalize (`Optional[bool]`, defaults to `True`):\n Whether to normalize the image.\n image_mean (`Union[float, list[float], NoneType]`, defaults to `[0.5, 0.5, 0.5]`):\n Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n image_std (`Union[float, list[float], NoneType]`, defaults to `[0.5, 0.5, 0.5]`):\n Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n `True`.\n do_convert_rgb (`Optional[bool]`, defaults to `None`):\n Whether to convert the image to RGB.\n return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`, defaults to `None`):\n Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n data_format (`Optional[~image_utils.ChannelDimension]`, defaults to `ChannelDimension.FIRST`):\n Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`, defaults to `None`):\n The channel dimension format for the input image. If unset, the channel dimension format is inferred\n from the input image. Can be one of:\n - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.\n device (`Optional[torch.device]`, defaults to `None`):\n The device to process the images on. If unset, the device is inferred from the input images.\n do_pan_and_scan (`Optional[bool]`, defaults to `None`):\n Whether to apply `pan_and_scan` to images.\n pan_and_scan_min_crop_size (`Optional[int]`, defaults to `None`):\n Minimum size of each crop in pan and scan.\n pan_and_scan_max_num_crops (`Optional[int]`, defaults to `None`):\n Maximum number of crops per image in pan and scan.\n pan_and_scan_min_ratio_to_activate (`Optional[float]`, defaults to `None`):\n Minimum aspect ratio to activate pan and scan.\n"""
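
# Expected auto-generated docstring for `Gemma3ImageProcessorFast.preprocess`.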
GEMMA3_IMAGE_PROCESSOR_FAST_PREPROCESS_DOCSTRING = """ Args:\n images (`Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, list[\'PIL.Image.Image\'], list[numpy.ndarray], list[\'torch.Tensor\']]`):\n Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If\n passing in images with pixel values between 0 and 1, set `do_rescale=False`.\n do_resize (`Optional[bool]`):\n Whether to resize the image.\n size (`Optional[dict[str, int]]`):\n Describes the maximum input dimensions to the model.\n default_to_square (`Optional[bool]`):\n Whether to default to a square image when resizing, if size is an int.\n resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`):\n Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n has an effect if `do_resize` is set to `True`.\n do_center_crop (`Optional[bool]`):\n Whether to center crop the image.\n crop_size (`Optional[dict[str, int]]`):\n Size of the output image after applying `center_crop`.\n do_rescale (`Optional[bool]`):\n Whether to rescale the image.\n rescale_factor (`Union[int, float, NoneType]`):\n Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n do_normalize (`Optional[bool]`):\n Whether to normalize the image.\n image_mean (`Union[float, list[float], NoneType]`):\n Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n image_std (`Union[float, list[float], NoneType]`):\n Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n `True`.\n do_convert_rgb (`Optional[bool]`):\n Whether to convert the image to RGB.\n return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`):\n Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n data_format (`Optional[~image_utils.ChannelDimension]`):\n Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`):\n The channel dimension format for the input image. If unset, the channel dimension format is inferred\n from the input image. Can be one of:\n - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.\n device (`Optional[torch.device]`):\n The device to process the images on. If unset, the device is inferred from the input images.\n do_pan_and_scan (`Optional[bool]`):\n Whether to apply `pan_and_scan` to images.\n pan_and_scan_min_crop_size (`Optional[int]`):\n Minimum size of each crop in pan and scan.\n pan_and_scan_max_num_crops (`Optional[int]`):\n Maximum number of crops per image in pan and scan.\n pan_and_scan_min_ratio_to_activate (`Optional[float]`):\n Minimum aspect ratio to activate pan and scan.\n\n Returns:\n `<class \'transformers.image_processing_base.BatchFeature\'>`:\n - **data** (`dict`) -- Dictionary of lists/arrays/tensors returned by the __call__ method (\'pixel_values\', etc.).\n - **tensor_type** (`Union[None, str, TensorType]`, *optional*) -- You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at\n initialization.\n"""


class AutoDocstringTest(unittest.TestCase):
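    """Checks the docstrings that `@auto_docstring` attaches to model and
    processor classes against the expected constants defined above."""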
pass
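
    # A minimal live smoke test (a sketch, kept separate from the exact string
    # comparisons below): it assumes `transformers` with the Llama models is
    # importable, and only checks that `@auto_docstring` attached a docstring
    # mentioning `input_ids`, not that the text matches the constants above
    # character for character.
    def test_llama_forward_docstrings_exist(self):
        from transformers import LlamaForCausalLM, LlamaModel

        for cls in (LlamaModel, LlamaForCausalLM):
            doc = cls.forward.__doc__
            self.assertIsNotNone(doc)
            self.assertIn("input_ids", doc)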
# def test_modeling_docstring(self):
# llama_docstring = " Args:\n images (`Union[PIL.Image.Image, numpy.ndarray, torch.Tensor, list['PIL.Image.Image'], list[numpy.ndarray], list['torch.Tensor']]`):\n Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If\n passing in images with pixel values between 0 and 1, set `do_rescale=False`.\n do_resize (`Optional[bool]`):\n Whether to resize the image.\n size (`Optional[dict[str, int]]`):\n Describes the maximum input dimensions to the model.\n default_to_square (`Optional[bool]`):\n Whether to default to a square image when resizing, if size is an int.\n resample (`Union[PILImageResampling, F.InterpolationMode, NoneType]`):\n Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only\n has an effect if `do_resize` is set to `True`.\n do_center_crop (`Optional[bool]`):\n Whether to center crop the image.\n crop_size (`Optional[dict[str, int]]`):\n Size of the output image after applying `center_crop`.\n do_rescale (`Optional[bool]`):\n Whether to rescale the image.\n rescale_factor (`Union[int, float, NoneType]`):\n Rescale factor to rescale the image by if `do_rescale` is set to `True`.\n do_normalize (`Optional[bool]`):\n Whether to normalize the image.\n image_mean (`Union[float, list[float], NoneType]`):\n Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.\n image_std (`Union[float, list[float], NoneType]`):\n Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to\n `True`.\n do_convert_rgb (`Optional[bool]`):\n Whether to convert the image to RGB.\n return_tensors (`Union[str, ~utils.generic.TensorType, NoneType]`):\n Returns stacked tensors if set to `pt, otherwise returns a list of tensors.\n data_format (`Optional[~image_utils.ChannelDimension]`):\n Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.\n input_data_format (`Union[str, ~image_utils.ChannelDimension, NoneType]`):\n The channel dimension format for the input image. If unset, the channel dimension format is inferred\n from the input image. Can be one of:\n - `\"channels_first\"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.\n - `\"channels_last\"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.\n - `\"none\"` or `ChannelDimension.NONE`: image in (height, width) format.\n device (`Optional[torch.device]`):\n The device to process the images on. If unset, the device is inferred from the input images.\n do_pan_and_scan (`Optional[bool]`):\n Whether to apply `pan_and_scan` to images.\n pan_and_scan_min_crop_size (`Optional[int]`):\n Minimum size of each crop in pan and scan.\n pan_and_scan_max_num_crops (`Optional[int]`):\n Maximum number of crops per image in pan and scan.\n pan_and_scan_min_ratio_to_activate (`Optional[float]`):\n Minimum aspect ratio to activate pan and scan.\n\n Returns:\n `<class 'transformers.image_processing_base.BatchFeature'>`:\n - **data** (`dict`) -- Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.).\n - **tensor_type** (`Union[None, str, TensorType]`, *optional*) -- You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at\n initialization.\n"
# self.assertEqual(llama_docstring, LlamaModel.__doc__)
# self.assertEqual(LLAMA_MODEL_DOCSTRING, LlamaModel.forward.__doc__)
# self.assertEqual(LLAMA_CLM_FORWARD, LlamaForCausalLM.forward.__doc__)
# self.assertEqual(LLAMA_DECODER, LlamaDecoderLayer.forward.__doc__)
# self.assertEqual(LLAMA_FOR_SEQUENCE_CLASSIFICATION_DOC, LlamaForSequenceClassification.forward.__doc__)
# def test_fast_image_processor_docstring(self):
# self.assertEqual(GEMMA3_IMAGE_PROCESSOR_FAST_DOCSTRING, Gemma3ImageProcessorFast.__doc__)
# self.assertEqual(GEMMA3_IMAGE_PROCESSOR_FAST_PREPROCESS_DOCSTRING, Gemma3ImageProcessorFast.preprocess.__doc__)
# def test_auto_doc(self):
# COOL_CLASS_DOC = """
# Args:
# input_ids (some):
# flash_attn_kwargs (FlashAttentionKwrargs):
# parameters that are completely optional and that should be passed.
# another_warg (something): should pass
# and_another_on (this time):
# I want
# this to be
# quite long
# Example
# ```python
# >>> import
# ```
# """
# @auto_docstring
# class MyModel:
# @auto_docstring
# def __init__(input_ids, flash_attn_kwargs=None, another_warg=True, and_another_on=1):
# r"""
# Args:
# flash_attn_kwargs (FlashAttentionKwrargs):
# parameters that are completely optional and that should be passed.
# another_warg (something): should pass
# and_another_on (this time):
# I want
# this to be
# quite long
# Example
# ```python
# >>> import
# ```
# """
# pass
# self.assertEqual(MyModel.__init__.__doc__, COOL_CLASS_DOC)