Merged
6 changes: 3 additions & 3 deletions src/transformers/models/align/configuration_align.py
@@ -50,8 +50,8 @@ class AlignTextConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 512
     type_vocab_size: int = 2
     initializer_range: float = 0.02
@@ -134,7 +134,7 @@ class AlignVisionConfig(PreTrainedConfig):
     initializer_range: float = 0.02
     batch_norm_eps: float = 0.001
     batch_norm_momentum: float = 0.99
-    drop_connect_rate: float = 0.2
+    drop_connect_rate: float | int = 0.2

     def __post_init__(self, **kwargs):
         self.num_hidden_layers = sum(self.num_block_repeats) * 4
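The pattern repeated throughout this diff widens dropout-style fields from float to float | int, presumably because some checkpoints serialize these values as plain ints (0 instead of 0.0). A minimal sketch of why the union matters, assuming an isinstance-based check over the dataclass annotations; this is not the actual transformers validator, only an illustration of the principle:

from dataclasses import dataclass, fields
from typing import get_args, get_type_hints


def validate(cfg) -> None:
    # Check every field's value against its annotation with isinstance (sketch only).
    hints = get_type_hints(type(cfg))
    for f in fields(cfg):
        annotation = hints[f.name]
        # get_args() unpacks "float | int" into (float, int); plain types return ().
        allowed = get_args(annotation) or (annotation,)
        value = getattr(cfg, f.name)
        if not isinstance(value, allowed):
            raise TypeError(f"{f.name}={value!r} does not match {annotation}")


@dataclass
class StrictConfig:
    hidden_dropout_prob: float = 0.1          # old annotation


@dataclass
class WidenedConfig:
    hidden_dropout_prob: float | int = 0.1    # new annotation


for cfg in (WidenedConfig(hidden_dropout_prob=0), StrictConfig(hidden_dropout_prob=0)):
    try:
        validate(cfg)
        print(f"{type(cfg).__name__}: accepted 0")
    except TypeError as err:
        print(f"{type(cfg).__name__}: {err}")

Since isinstance(0, float) is False, the strict annotation rejects a config that stores 0, while the widened one accepts it.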
4 changes: 2 additions & 2 deletions src/transformers/models/altclip/configuration_altclip.py
@@ -52,8 +52,8 @@ class AltCLIPTextConfig(PreTrainedConfig):
     num_attention_heads: int = 16
     intermediate_size: int = 4096
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 514
     type_vocab_size: int = 1
     initializer_range: float = 0.02

@@ -52,8 +52,8 @@ class ASTConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.0
-    attention_probs_dropout_prob: float = 0.0
+    hidden_dropout_prob: float | int = 0.0
+    attention_probs_dropout_prob: float | int = 0.0
     initializer_range: float = 0.02
     layer_norm_eps: float = 1e-12
     patch_size: int | list[int] | tuple[int, int] = 16
6 changes: 3 additions & 3 deletions src/transformers/models/beit/configuration_beit.py
@@ -75,8 +75,8 @@ class BeitConfig(BackboneConfigMixin, PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.0
-    attention_probs_dropout_prob: float = 0.0
+    hidden_dropout_prob: float | int = 0.0
+    attention_probs_dropout_prob: float | int = 0.0
     initializer_range: float = 0.02
     layer_norm_eps: float = 1e-12
     image_size: int | list[int] | tuple[int, int] = 224
@@ -87,7 +87,7 @@ class BeitConfig(BackboneConfigMixin, PreTrainedConfig):
     use_relative_position_bias: bool = False
     use_shared_relative_position_bias: bool = False
     layer_scale_init_value: float = 0.1
-    drop_path_rate: float = 0.1
+    drop_path_rate: float | int = 0.1
     use_mean_pooling: bool = True
     pool_scales: list[int] | tuple[int, ...] = (1, 2, 3, 6)
     use_auxiliary_head: bool = True
4 changes: 2 additions & 2 deletions src/transformers/models/bert/configuration_bert.py
@@ -47,8 +47,8 @@ class BertConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 512
     type_vocab_size: int = 2
     initializer_range: float = 0.02

@@ -46,8 +46,8 @@ class BertGenerationConfig(PreTrainedConfig):
     num_attention_heads: int = 16
     intermediate_size: int = 4096
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 512
     initializer_range: float = 0.02
     layer_norm_eps: float = 1e-12
6 changes: 3 additions & 3 deletions src/transformers/models/big_bird/configuration_big_bird.py
@@ -59,13 +59,13 @@ class BigBirdConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu_new"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 4096
     type_vocab_size: int = 2
     initializer_range: float = 0.02
     layer_norm_eps: float = 1e-12
-    use_cache: int = True
+    use_cache: bool = True
     pad_token_id: int | None = 0
     bos_token_id: int | None = 1
     eos_token_id: int | list[int] | None = 2
4 changes: 2 additions & 2 deletions src/transformers/models/biogpt/configuration_biogpt.py
@@ -46,8 +46,8 @@ class BioGptConfig(PreTrainedConfig):
     num_attention_heads: int = 16
     intermediate_size: int = 4096
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 1024
     initializer_range: float = 0.02
     layer_norm_eps: float = 1e-12
2 changes: 1 addition & 1 deletion src/transformers/models/bit/configuration_bit.py
@@ -62,7 +62,7 @@ class BitConfig(BackboneConfigMixin, PreTrainedConfig):
     hidden_act: str = "relu"
     global_padding: str | None = None
     num_groups: int = 32
-    drop_path_rate: float = 0.0
+    drop_path_rate: float | int = 0.0
     embedding_dynamic_padding: bool = False
     output_stride: int = 32
     width_factor: int = 1
4 changes: 2 additions & 2 deletions src/transformers/models/blip/configuration_blip.py
@@ -59,8 +59,8 @@ class BlipTextConfig(PreTrainedConfig):
     max_position_embeddings: int = 512
     hidden_act: str = "gelu"
     layer_norm_eps: float = 1e-12
-    hidden_dropout_prob: float = 0.0
-    attention_probs_dropout_prob: float = 0.0
+    hidden_dropout_prob: float | int = 0.0
+    attention_probs_dropout_prob: float | int = 0.0
     initializer_range: float = 0.02
     bos_token_id: int | None = 30522
     eos_token_id: int | list[int] | None = 2
4 changes: 2 additions & 2 deletions src/transformers/models/blip_2/configuration_blip_2.py
@@ -91,8 +91,8 @@ class Blip2QFormerConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 512
     initializer_range: float = 0.02
     layer_norm_eps: float = 1e-12

@@ -86,8 +86,8 @@ class BridgeTowerTextConfig(PreTrainedConfig):
     initializer_factor: float | int = 1
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 514
     type_vocab_size: int = 1
     layer_norm_eps: float = 1e-05
6 changes: 3 additions & 3 deletions src/transformers/models/bros/configuration_bros.py
@@ -53,8 +53,8 @@ class BrosConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 512
     type_vocab_size: int = 2
     initializer_range: float = 0.02
@@ -63,7 +63,7 @@ class BrosConfig(PreTrainedConfig):
     dim_bbox: int = 8
     bbox_scale: float = 100.0
     n_relations: int = 1
-    classifier_dropout_prob: float = 0.1
+    classifier_dropout_prob: float | int = 0.1
     is_decoder: bool = False
     add_cross_attention: bool = False

4 changes: 2 additions & 2 deletions src/transformers/models/camembert/configuration_camembert.py
@@ -47,8 +47,8 @@ class CamembertConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 512
     type_vocab_size: int = 2
     initializer_range: float = 0.02
4 changes: 2 additions & 2 deletions src/transformers/models/canine/configuration_canine.py
@@ -59,8 +59,8 @@ class CanineConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 16384
     type_vocab_size: int = 16
     initializer_range: float = 0.02

@@ -107,7 +107,7 @@ class ChameleonConfig(PreTrainedConfig):
     eos_token_id: int | list[int] | None = 2
     tie_word_embeddings: bool = False
     rope_parameters: RopeParameters | dict | None = None
-    attention_bias: int | None = False
+    attention_bias: bool | None = False
     attention_dropout: float | int | None = 0.0
     model_parallel_size: int | None = 1
     swin_norm: bool | None = False

@@ -53,8 +53,8 @@ class ChineseCLIPTextConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 512
     type_vocab_size: int = 2
     initializer_range: float = 0.02
10 changes: 5 additions & 5 deletions src/transformers/models/clap/configuration_clap.py
@@ -50,8 +50,8 @@ class ClapTextConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 514
     type_vocab_size: int = 1
     initializer_factor: float = 1.0
@@ -123,14 +123,14 @@ class ClapAudioConfig(PreTrainedConfig):
     depths: list[int] | tuple[int, ...] = (2, 2, 6, 2)
     num_attention_heads: list[int] | tuple[int, ...] = (4, 8, 16, 32)
     enable_fusion: bool = False
-    hidden_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
     fusion_type: str | None = None
     patch_embed_input_channels: int = 1
     flatten_patch_embeds: bool = True
     patch_embeds_hidden_size: int = 96
     enable_patch_layer_norm: bool = True
-    drop_path_rate: float = 0.0
-    attention_probs_dropout_prob: float = 0.0
+    drop_path_rate: float | int = 0.0
+    attention_probs_dropout_prob: float | int = 0.0
     qkv_bias: bool = True
     mlp_ratio: float = 4.0
     aff_block_r: int = 4
4 changes: 2 additions & 2 deletions src/transformers/models/clvp/configuration_clvp.py
@@ -159,8 +159,8 @@ class ClvpDecoderConfig(PreTrainedConfig):
     n_inner: int | None = None
     num_mel_attn_blocks: int = 6
     activation_function: str = "gelu_new"
-    resid_pdrop: float = 0.1
-    embd_pdrop: float = 0.1
+    resid_pdrop: float | int = 0.1
+    embd_pdrop: float | int = 0.1
     attention_dropout: float | int = 0.1
     layer_norm_epsilon: float = 1e-5
     initializer_range: float = 0.02
6 changes: 3 additions & 3 deletions src/transformers/models/codegen/configuration_codegen.py
@@ -63,9 +63,9 @@ class CodeGenConfig(PreTrainedConfig):
     rotary_dim: int = 64
     n_inner: int | None = None
     activation_function: str = "gelu_new"
-    resid_pdrop: float = 0.0
-    embd_pdrop: float = 0.0
-    attn_pdrop: float = 0.0
+    resid_pdrop: float | int = 0.0
+    embd_pdrop: float | int = 0.0
+    attn_pdrop: float | int = 0.0
     layer_norm_epsilon: float = 1e-5
     initializer_range: float = 0.02
     use_cache: bool = True
2 changes: 1 addition & 1 deletion src/transformers/models/cohere2/configuration_cohere2.py
@@ -74,7 +74,7 @@ class Cohere2Config(PreTrainedConfig):
     max_position_embeddings: int = 8192
     initializer_range: float = 0.02
     layer_norm_eps: float = 1e-5
-    use_cache: int = True
+    use_cache: bool = True
     pad_token_id: int | None = 0
     bos_token_id: int | None = 5
     eos_token_id: int | list[int] | None = 255001

Review thread on the use_cache line:

Member: I wonder why the tests didn't complain, because we have use_cache set to an integer 🤔

Member: isinstance(True, int) == True 🫠 nice, I'll check if we can upstream this to the type validator

Member: Fixed on the Hub repo, though it will take time to upstream it. I'll review and merge your PR today :)
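As the thread above notes, bool is a subclass of int in Python, so an isinstance-based check never flags use_cache: int = True. A quick illustration; the passes() helper below is a hypothetical stand-in for such a check, not the actual transformers type validator:

# bool is a subclass of int, so True/False satisfy an int annotation under isinstance.
print(isinstance(True, int))   # True
print(issubclass(bool, int))   # True
print(True + True)             # 2 -- bools really are ints at runtime

def passes(value, annotation) -> bool:
    # Hypothetical isinstance-based field check.
    return isinstance(value, annotation)

print(passes(True, int))       # True  -> "use_cache: int = True" sails through
print(passes(True, bool))      # True  -> the corrected "use_cache: bool = True" also passes
print(passes(1, bool))         # False -> the reverse is caught: 1 is not a bool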
2 changes: 1 addition & 1 deletion src/transformers/models/cohere2/modular_cohere2.py
@@ -95,7 +95,7 @@ class Cohere2Config(PreTrainedConfig):
     max_position_embeddings: int = 8192
     initializer_range: float = 0.02
     layer_norm_eps: float = 1e-5
-    use_cache: int = True
+    use_cache: bool = True
     pad_token_id: int | None = 0
     bos_token_id: int | None = 5
     eos_token_id: int | list[int] | None = 255001
4 changes: 2 additions & 2 deletions src/transformers/models/convbert/configuration_convbert.py
@@ -51,8 +51,8 @@ class ConvBertConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 512
     type_vocab_size: int = 2
     initializer_range: float = 0.02
2 changes: 1 addition & 1 deletion src/transformers/models/convnext/configuration_convnext.py
@@ -52,7 +52,7 @@ class ConvNextConfig(BackboneConfigMixin, PreTrainedConfig):
     initializer_range: float = 0.02
     layer_norm_eps: float = 1e-12
     layer_scale_init_value: float = 1e-6
-    drop_path_rate: float = 0.0
+    drop_path_rate: float | int = 0.0
     image_size: int | list[int] | tuple[int, int] = 224
     _out_features: list[str] | None = None
     _out_indices: list[int] | None = None

@@ -51,7 +51,7 @@ class ConvNextV2Config(BackboneConfigMixin, PreTrainedConfig):
     hidden_act: str = "gelu"
     initializer_range: float = 0.02
     layer_norm_eps: float = 1e-12
-    drop_path_rate: float = 0.0
+    drop_path_rate: float | int = 0.0
     image_size: int | list[int] | tuple[int, int] = 224
     _out_features: list[str] | None = None
     _out_indices: list[int] | None = None
2 changes: 1 addition & 1 deletion src/transformers/models/cpmant/configuration_cpmant.py
@@ -58,7 +58,7 @@ class CpmAntConfig(PreTrainedConfig):
     dim_head: int = 128
     dim_ff: int = 10240
     num_hidden_layers: int = 48
-    dropout_p: float = 0.0
+    dropout_p: float | int = 0.0
     position_bias_num_buckets: int = 512
     position_bias_max_distance: int = 2048
     eps: float = 1e-6
4 changes: 2 additions & 2 deletions src/transformers/models/ctrl/configuration_ctrl.py
@@ -56,8 +56,8 @@ class CTRLConfig(PreTrainedConfig):
     dff: int = 8192
     n_layer: int = 48
     n_head: int = 16
-    resid_pdrop: float = 0.1
-    embd_pdrop: float = 0.1
+    resid_pdrop: float | int = 0.1
+    embd_pdrop: float | int = 0.1
     layer_norm_epsilon: float = 1e-6
     initializer_range: float = 0.02
     use_cache: bool = True
2 changes: 1 addition & 1 deletion src/transformers/models/dab_detr/configuration_dab_detr.py
@@ -81,7 +81,7 @@ class DabDetrConfig(PreTrainedConfig):
     decoder_layers: int = 6
     decoder_ffn_dim: int = 2048
     decoder_attention_heads: int = 8
-    is_encoder_decoder: int = True
+    is_encoder_decoder: bool = True
     activation_function: str = "prelu"
     hidden_size: int = 256
     dropout: float | int = 0.1

@@ -147,10 +147,10 @@ class Data2VecAudioConfig(PreTrainedConfig):
     num_conv_pos_embedding_groups: int = 16
     conv_pos_kernel_size: int = 19
     num_conv_pos_embeddings: int = 5
-    mask_time_prob: float = 0.05
+    mask_time_prob: float | int = 0.05
     mask_time_length: int = 10
     mask_time_min_masks: int = 2
-    mask_feature_prob: float = 0.0
+    mask_feature_prob: float | int = 0.0
     mask_feature_length: int = 10
     mask_feature_min_masks: int = 0
     ctc_loss_reduction: str = "sum"

@@ -46,8 +46,8 @@ class Data2VecTextConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.1
-    attention_probs_dropout_prob: float = 0.1
+    hidden_dropout_prob: float | int = 0.1
+    attention_probs_dropout_prob: float | int = 0.1
     max_position_embeddings: int = 512
     type_vocab_size: int = 2
     initializer_range: float = 0.02

@@ -65,8 +65,8 @@ class Data2VecVisionConfig(PreTrainedConfig):
     num_attention_heads: int = 12
     intermediate_size: int = 3072
     hidden_act: str = "gelu"
-    hidden_dropout_prob: float = 0.0
-    attention_probs_dropout_prob: float = 0.0
+    hidden_dropout_prob: float | int = 0.0
+    attention_probs_dropout_prob: float | int = 0.0
     initializer_range: float = 0.02
     layer_norm_eps: float = 1e-12
     image_size: int | list[int] | tuple[int, int] = 224
@@ -77,7 +77,7 @@ class Data2VecVisionConfig(PreTrainedConfig):
     use_relative_position_bias: bool = False
     use_shared_relative_position_bias: bool = False
     layer_scale_init_value: float = 0.1
-    drop_path_rate: float = 0.1
+    drop_path_rate: float | int = 0.1
     use_mean_pooling: bool = True
     out_indices: list[int] | tuple[int, ...] = (3, 5, 7, 11)
     pool_scales: list[int] | tuple[int, ...] = (1, 2, 3, 6)
2 changes: 1 addition & 1 deletion src/transformers/models/dbrx/configuration_dbrx.py
@@ -37,7 +37,7 @@ class DbrxAttentionConfig(PreTrainedConfig):

     base_config_key = "attn_config"

-    attn_pdrop: float = 0.0
+    attn_pdrop: float | int = 0.0
     clip_qkv: int | float | None = None
     kv_n_heads: int = 1
