@@ -146,7 +146,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

-    def build(self, input_shape=None):
+    def build(self, input_shape: tf.TensorShape):
         with tf.name_scope("word_embeddings"):
             self.weight = self.add_weight(
                 name="weight",
@@ -168,12 +168,7 @@ def build(self, input_shape=None):
                 initializer=get_initializer(self.initializer_range),
             )

-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.embedding_size])
+        super().build(input_shape)

     # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
     def call(
@@ -251,8 +246,6 @@ def __init__(self, config: AlbertConfig, **kwargs):
         # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
         self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
         self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-        self.hidden_size = config.hidden_size
-        self.config = config

     def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
         # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -314,26 +307,6 @@ def call(

         return outputs

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "query", None) is not None:
-            with tf.name_scope(self.query.name):
-                self.query.build(self.config.hidden_size)
-        if getattr(self, "key", None) is not None:
-            with tf.name_scope(self.key.name):
-                self.key.build(self.config.hidden_size)
-        if getattr(self, "value", None) is not None:
-            with tf.name_scope(self.value.name):
-                self.value.build(self.config.hidden_size)
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
-

 class TFAlbertLayer(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
@@ -356,9 +329,6 @@ def __init__(self, config: AlbertConfig, **kwargs):
             epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
         )
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-        self.intermediate_size = config.intermediate_size
-        self.hidden_size = config.hidden_size
-        self.config = config

     def call(
         self,
@@ -386,23 +356,6 @@ def call(

         return outputs

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "attention", None) is not None:
-            with tf.name_scope(self.attention.name):
-                self.attention.build(None)
-        if getattr(self, "ffn", None) is not None:
-            with tf.name_scope(self.ffn.name):
-                self.ffn.build(self.config.hidden_size)
-        if getattr(self, "ffn_output", None) is not None:
-            with tf.name_scope(self.ffn_output.name):
-                self.ffn_output.build(self.config.intermediate_size)
-        if getattr(self, "full_layer_layer_norm", None) is not None:
-            with tf.name_scope(self.full_layer_layer_norm.name):
-                self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
-

 class TFAlbertLayerGroup(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
@@ -446,15 +399,6 @@ def call(

         return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None)

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "albert_layers", None) is not None:
-            for layer in self.albert_layers:
-                with tf.name_scope(layer.name):
-                    layer.build(None)
-

 class TFAlbertTransformer(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
@@ -472,7 +416,6 @@ def __init__(self, config: AlbertConfig, **kwargs):
         self.albert_layer_groups = [
             TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups)
         ]
-        self.config = config

     def call(
         self,
@@ -514,18 +457,6 @@ def call(
             last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
         )

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "embedding_hidden_mapping_in", None) is not None:
-            with tf.name_scope(self.embedding_hidden_mapping_in.name):
-                self.embedding_hidden_mapping_in.build(self.config.embedding_size)
-        if getattr(self, "albert_layer_groups", None) is not None:
-            for layer in self.albert_layer_groups:
-                with tf.name_scope(layer.name):
-                    layer.build(None)
-

 class TFAlbertPreTrainedModel(TFPreTrainedModel):
     """
@@ -556,23 +487,14 @@ def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
         self.decoder = input_embeddings
-        self.hidden_size = config.hidden_size

-    def build(self, input_shape=None):
+    def build(self, input_shape: tf.TensorShape):
         self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
         self.decoder_bias = self.add_weight(
             shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
         )

-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.embedding_size])
+        super().build(input_shape)

     def get_output_embeddings(self) -> tf.keras.layers.Layer:
         return self.decoder
@@ -728,20 +650,6 @@ def call(
             attentions=encoder_outputs.attentions,
         )

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "embeddings", None) is not None:
-            with tf.name_scope(self.embeddings.name):
-                self.embeddings.build(None)
-        if getattr(self, "encoder", None) is not None:
-            with tf.name_scope(self.encoder.name):
-                self.encoder.build(None)
-        if getattr(self, "pooler", None) is not None:
-            with tf.name_scope(self.pooler.name):
-                self.pooler.build(None)  # TODO Matt might be wrong
-

 @dataclass
 class TFAlbertForPreTrainingOutput(ModelOutput):
@@ -917,14 +825,6 @@ def call(

         return outputs

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "albert", None) is not None:
-            with tf.name_scope(self.albert.name):
-                self.albert.build(None)
-

 @add_start_docstrings(
     """
@@ -1021,20 +921,6 @@ def call(
             attentions=outputs.attentions,
         )

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "albert", None) is not None:
-            with tf.name_scope(self.albert.name):
-                self.albert.build(None)
-        if getattr(self, "predictions", None) is not None:
-            with tf.name_scope(self.predictions.name):
-                self.predictions.build(None)
-        if getattr(self, "sop_classifier", None) is not None:
-            with tf.name_scope(self.sop_classifier.name):
-                self.sop_classifier.build(None)
-

 class TFAlbertSOPHead(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
@@ -1046,23 +932,13 @@ def __init__(self, config: AlbertConfig, **kwargs):
             kernel_initializer=get_initializer(config.initializer_range),
             name="classifier",
         )
-        self.hidden_size = config.hidden_size
-        self.config = config

     def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor:
         dropout_pooled_output = self.dropout(inputs=pooled_output, training=training)
         logits = self.classifier(inputs=dropout_pooled_output)

         return logits

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(self.config.hidden_size)
-

 @add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
 class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
@@ -1159,17 +1035,6 @@ def call(
             attentions=outputs.attentions,
         )

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "albert", None) is not None:
-            with tf.name_scope(self.albert.name):
-                self.albert.build(None)
-        if getattr(self, "predictions", None) is not None:
-            with tf.name_scope(self.predictions.name):
-                self.predictions.build(None)
-

 @add_start_docstrings(
     """
@@ -1193,8 +1058,6 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
-        self.hidden_size = config.hidden_size
-        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1254,17 +1117,6 @@ def call(
             attentions=outputs.attentions,
         )

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "albert", None) is not None:
-            with tf.name_scope(self.albert.name):
-                self.albert.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(self.config.hidden_size)
-

 @add_start_docstrings(
     """
@@ -1293,8 +1145,6 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
-        self.hidden_size = config.hidden_size
-        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1350,17 +1200,6 @@ def call(
            attentions=outputs.attentions,
         )

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "albert", None) is not None:
-            with tf.name_scope(self.albert.name):
-                self.albert.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(self.config.hidden_size)
-

 @add_start_docstrings(
     """
@@ -1382,8 +1221,6 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.qa_outputs = tf.keras.layers.Dense(
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
-        self.hidden_size = config.hidden_size
-        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1458,17 +1295,6 @@ def call(
             attentions=outputs.attentions,
         )

-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "albert", None) is not None:
-            with tf.name_scope(self.albert.name):
-                self.albert.build(None)
-        if getattr(self, "qa_outputs", None) is not None:
-            with tf.name_scope(self.qa_outputs.name):
-                self.qa_outputs.build(self.config.hidden_size)
-

 @add_start_docstrings(
     """
@@ -1490,8 +1316,6 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
-        self.hidden_size = config.hidden_size
-        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@@ -1570,14 +1394,3 @@ def call(
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
-
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "albert", None) is not None:
-            with tf.name_scope(self.albert.name):
-                self.albert.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(self.config.hidden_size)
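Note (not part of the diff): every hunk above swaps the same two Keras build() idioms. The "-" side marks the layer built and then eagerly builds each sub-layer under its own name scope with shapes taken from the config; the "+" side defers to super().build(), letting sub-layers create their weights lazily on first call. The following is a minimal standalone sketch of the two idioms using stock tf.keras only; ExplicitBuild and DeferredBuild are illustrative names, not the transformers implementation.

    import tensorflow as tf


    class ExplicitBuild(tf.keras.layers.Layer):
        # "-" side idiom: build sub-layers eagerly, under their own name
        # scopes, from shapes known ahead of time (here, hidden_size).
        def __init__(self, hidden_size: int, **kwargs):
            super().__init__(**kwargs)
            self.hidden_size = hidden_size
            self.dense = tf.keras.layers.Dense(hidden_size, name="dense")
            self.LayerNorm = tf.keras.layers.LayerNormalization(name="LayerNorm")

        def build(self, input_shape=None):
            if self.built:
                return
            self.built = True
            if getattr(self, "dense", None) is not None:
                with tf.name_scope(self.dense.name):
                    self.dense.build([None, None, self.hidden_size])
            if getattr(self, "LayerNorm", None) is not None:
                with tf.name_scope(self.LayerNorm.name):
                    self.LayerNorm.build([None, None, self.hidden_size])


    class DeferredBuild(tf.keras.layers.Layer):
        # "+" side idiom: defer to super().build(); sub-layers create their
        # weights lazily the first time the layer runs on real inputs.
        def __init__(self, hidden_size: int, **kwargs):
            super().__init__(**kwargs)
            self.dense = tf.keras.layers.Dense(hidden_size, name="dense")
            self.LayerNorm = tf.keras.layers.LayerNormalization(name="LayerNorm")

        def build(self, input_shape: tf.TensorShape):
            super().build(input_shape)

        def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
            return self.LayerNorm(self.dense(hidden_states))


    # Eager building creates variables without any data flowing through the
    # layer; deferred building waits for the first call.
    explicit = ExplicitBuild(hidden_size=8)
    explicit.build(None)
    print(len(explicit.weights))  # 4: dense kernel/bias + LayerNorm gamma/beta

    deferred = DeferredBuild(hidden_size=8)
    print(len(deferred.weights))  # 0: nothing built yet
    deferred(tf.zeros((1, 4, 8)))
    print(len(deferred.weights))  # 4: built on first call

The practical difference is when variables exist: the explicit pattern can materialize (and name) every weight before any forward pass, which matters for checkpoint loading, while the deferred pattern keeps build() trivial and relies on the first call to shape the sub-layers.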