@@ -146,7 +146,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
146
146
self .LayerNorm = tf .keras .layers .LayerNormalization (epsilon = config .layer_norm_eps , name = "LayerNorm" )
147
147
self .dropout = tf .keras .layers .Dropout (rate = config .hidden_dropout_prob )
148
148
149
- def build (self , input_shape : tf . TensorShape ):
149
+ def build (self , input_shape = None ):
150
150
with tf .name_scope ("word_embeddings" ):
151
151
self .weight = self .add_weight (
152
152
name = "weight" ,
@@ -168,7 +168,12 @@ def build(self, input_shape: tf.TensorShape):
168
168
initializer = get_initializer (self .initializer_range ),
169
169
)
170
170
171
- super ().build (input_shape )
171
+ if self .built :
172
+ return
173
+ self .built = True
174
+ if getattr (self , "LayerNorm" , None ) is not None :
175
+ with tf .name_scope (self .LayerNorm .name ):
176
+ self .LayerNorm .build ([None , None , self .config .embedding_size ])
172
177
173
178
# Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
174
179
def call (
@@ -246,6 +251,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
246
251
# Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
247
252
self .attention_dropout = tf .keras .layers .Dropout (rate = config .attention_probs_dropout_prob )
248
253
self .output_dropout = tf .keras .layers .Dropout (rate = config .hidden_dropout_prob )
254
+ self .config = config
249
255
250
256
def transpose_for_scores (self , tensor : tf .Tensor , batch_size : int ) -> tf .Tensor :
251
257
# Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -307,6 +313,26 @@ def call(
307
313
308
314
return outputs
309
315
316
def build(self, input_shape=None):
    """Build the projection sub-layers and the LayerNorm, each under its own name scope.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    The call is a no-op once ``self.built`` is set. Sub-layers that are missing
    or ``None`` are skipped.
    """
    if not self.built:
        self.built = True
        # query / key / value / dense all receive the same build argument.
        for attr_name in ("query", "key", "value", "dense"):
            projection = getattr(self, attr_name, None)
            if projection is not None:
                with tf.name_scope(projection.name):
                    # NOTE(review): a bare int is passed as the input shape here,
                    # unlike the list used for LayerNorm below — confirm intended.
                    projection.build(self.config.hidden_size)
        norm = getattr(self, "LayerNorm", None)
        if norm is not None:
            with tf.name_scope(norm.name):
                norm.build([None, None, self.config.hidden_size])
335
+
310
336
311
337
class TFAlbertLayer (tf .keras .layers .Layer ):
312
338
def __init__ (self , config : AlbertConfig , ** kwargs ):
@@ -329,6 +355,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
329
355
epsilon = config .layer_norm_eps , name = "full_layer_layer_norm"
330
356
)
331
357
self .dropout = tf .keras .layers .Dropout (rate = config .hidden_dropout_prob )
358
+ self .config = config
332
359
333
360
def call (
334
361
self ,
@@ -356,6 +383,23 @@ def call(
356
383
357
384
return outputs
358
385
386
def build(self, input_shape=None):
    """Build ``attention``, ``ffn``, ``ffn_output`` and ``full_layer_layer_norm`` in that order.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    Returns immediately when ``self.built`` is already set. Each existing
    sub-layer is built inside ``tf.name_scope(<sub-layer name>)``.
    """
    if self.built:
        return
    self.built = True

    def _build_child(attr_name, shape_fn):
        # Skip sub-layers that are absent or None; the shape is computed lazily
        # inside the name scope, only when the sub-layer exists.
        child = getattr(self, attr_name, None)
        if child is None:
            return
        with tf.name_scope(child.name):
            child.build(shape_fn())

    _build_child("attention", lambda: None)
    _build_child("ffn", lambda: self.config.hidden_size)
    _build_child("ffn_output", lambda: self.config.intermediate_size)
    _build_child("full_layer_layer_norm", lambda: [None, None, self.config.hidden_size])
402
+
359
403
360
404
class TFAlbertLayerGroup (tf .keras .layers .Layer ):
361
405
def __init__ (self , config : AlbertConfig , ** kwargs ):
@@ -399,6 +443,15 @@ def call(
399
443
400
444
return tuple (v for v in [hidden_states , layer_hidden_states , layer_attentions ] if v is not None )
401
445
446
def build(self, input_shape=None):
    """Build every entry of ``self.albert_layers`` under a name scope named after it.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    Does nothing when ``self.built`` is already set, or when ``albert_layers``
    is missing or ``None``.
    """
    if self.built:
        return
    self.built = True

    members = getattr(self, "albert_layers", None)
    if members is None:
        return
    for member in members:
        with tf.name_scope(member.name):
            member.build(None)
454
+
402
455
403
456
class TFAlbertTransformer (tf .keras .layers .Layer ):
404
457
def __init__ (self , config : AlbertConfig , ** kwargs ):
@@ -416,6 +469,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
416
469
self .albert_layer_groups = [
417
470
TFAlbertLayerGroup (config , name = f"albert_layer_groups_._{ i } " ) for i in range (config .num_hidden_groups )
418
471
]
472
+ self .config = config
419
473
420
474
def call (
421
475
self ,
@@ -457,6 +511,18 @@ def call(
457
511
last_hidden_state = hidden_states , hidden_states = all_hidden_states , attentions = all_attentions
458
512
)
459
513
514
def build(self, input_shape=None):
    """Build ``embedding_hidden_mapping_in`` and then each entry of ``albert_layer_groups``.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    A no-op once ``self.built`` is set. Missing or ``None`` attributes are skipped.
    """
    if not self.built:
        self.built = True

        mapping_in = getattr(self, "embedding_hidden_mapping_in", None)
        if mapping_in is not None:
            with tf.name_scope(mapping_in.name):
                mapping_in.build(self.config.embedding_size)

        groups = getattr(self, "albert_layer_groups", None)
        if groups is not None:
            for group in groups:
                with tf.name_scope(group.name):
                    group.build(None)
525
+
460
526
461
527
class TFAlbertPreTrainedModel (TFPreTrainedModel ):
462
528
"""
@@ -488,13 +554,21 @@ def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer
488
554
# an output-only bias for each token.
489
555
self .decoder = input_embeddings
490
556
491
- def build (self , input_shape : tf . TensorShape ):
557
def build(self, input_shape=None):
    """Create the ``bias`` / ``decoder_bias`` weights and build ``dense`` and ``LayerNorm``.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    The ``self.built`` guard runs before any ``add_weight`` call, so calling
    ``build`` again on an already-built layer returns without creating the two
    bias variables a second time.
    """
    if self.built:
        return
    self.built = True

    # Both biases have one entry per vocabulary token and start at zero.
    self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
    self.decoder_bias = self.add_weight(
        shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
    )

    # Sub-layers are built under their own name scopes; absent ones are skipped.
    if getattr(self, "dense", None) is not None:
        with tf.name_scope(self.dense.name):
            self.dense.build(self.config.hidden_size)
    if getattr(self, "LayerNorm", None) is not None:
        with tf.name_scope(self.LayerNorm.name):
            self.LayerNorm.build([None, None, self.config.embedding_size])
498
572
499
573
def get_output_embeddings (self ) -> tf .keras .layers .Layer :
500
574
return self .decoder
@@ -650,6 +724,20 @@ def call(
650
724
attentions = encoder_outputs .attentions ,
651
725
)
652
726
727
def build(self, input_shape=None):
    """Build ``embeddings``, ``encoder`` and ``pooler`` (in that order) with a ``None`` shape.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    Returns immediately when ``self.built`` is already set. Attributes that are
    missing or ``None`` are skipped.
    """
    if self.built:
        return
    self.built = True

    # NOTE(review): the original author flagged that building ``pooler`` with
    # ``None`` might be wrong — confirm against the pooler's layer type.
    for attr_name in ("embeddings", "encoder", "pooler"):
        component = getattr(self, attr_name, None)
        if component is None:
            continue
        with tf.name_scope(component.name):
            component.build(None)
740
+
653
741
654
742
@dataclass
655
743
class TFAlbertForPreTrainingOutput (ModelOutput ):
@@ -825,6 +913,14 @@ def call(
825
913
826
914
return outputs
827
915
916
def build(self, input_shape=None):
    """Build the wrapped ``albert`` layer under a name scope named after it.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    A no-op once ``self.built`` is set, or when ``albert`` is missing or ``None``.
    """
    if not self.built:
        self.built = True
        backbone = getattr(self, "albert", None)
        if backbone is not None:
            with tf.name_scope(backbone.name):
                backbone.build(None)
923
+
828
924
829
925
@add_start_docstrings (
830
926
"""
@@ -921,6 +1017,20 @@ def call(
921
1017
attentions = outputs .attentions ,
922
1018
)
923
1019
1020
def build(self, input_shape=None):
    """Build ``albert``, ``predictions`` and ``sop_classifier`` (in that order) with a ``None`` shape.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    Returns immediately when ``self.built`` is already set. Attributes that are
    missing or ``None`` are skipped.
    """
    if self.built:
        return
    self.built = True

    for attr_name in ("albert", "predictions", "sop_classifier"):
        part = getattr(self, attr_name, None)
        if part is not None:
            with tf.name_scope(part.name):
                part.build(None)
1033
+
924
1034
925
1035
class TFAlbertSOPHead (tf .keras .layers .Layer ):
926
1036
def __init__ (self , config : AlbertConfig , ** kwargs ):
@@ -932,13 +1042,22 @@ def __init__(self, config: AlbertConfig, **kwargs):
932
1042
kernel_initializer = get_initializer (config .initializer_range ),
933
1043
name = "classifier" ,
934
1044
)
1045
+ self .config = config
935
1046
936
1047
def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor:
    """Apply dropout to ``pooled_output`` and feed the result to the classifier.

    Args:
        pooled_output: Tensor passed to ``self.dropout`` as ``inputs``.
        training: Forwarded to ``self.dropout`` as its ``training`` flag.

    Returns:
        The output of ``self.classifier`` on the dropped-out input.
    """
    return self.classifier(inputs=self.dropout(inputs=pooled_output, training=training))
941
1052
1053
def build(self, input_shape=None):
    """Build the ``classifier`` sub-layer under a name scope named after it.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    A no-op once ``self.built`` is set, or when ``classifier`` is missing or ``None``.
    """
    if self.built:
        return
    self.built = True

    head = getattr(self, "classifier", None)
    if head is None:
        return
    with tf.name_scope(head.name):
        # NOTE(review): a bare int is passed as the input shape — confirm intended.
        head.build(self.config.hidden_size)
1060
+
942
1061
943
1062
@add_start_docstrings ("""Albert Model with a `language modeling` head on top.""" , ALBERT_START_DOCSTRING )
944
1063
class TFAlbertForMaskedLM (TFAlbertPreTrainedModel , TFMaskedLanguageModelingLoss ):
@@ -1035,6 +1154,17 @@ def call(
1035
1154
attentions = outputs .attentions ,
1036
1155
)
1037
1156
1157
def build(self, input_shape=None):
    """Build ``albert`` and then ``predictions``, each with a ``None`` shape.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    Returns immediately when ``self.built`` is already set. Attributes that are
    missing or ``None`` are skipped.
    """
    if self.built:
        return
    self.built = True

    for attr_name in ("albert", "predictions"):
        submodule = getattr(self, attr_name, None)
        if submodule is None:
            continue
        with tf.name_scope(submodule.name):
            submodule.build(None)
1167
+
1038
1168
1039
1169
@add_start_docstrings (
1040
1170
"""
@@ -1058,6 +1188,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
1058
1188
self .classifier = tf .keras .layers .Dense (
1059
1189
units = config .num_labels , kernel_initializer = get_initializer (config .initializer_range ), name = "classifier"
1060
1190
)
1191
+ self .config = config
1061
1192
1062
1193
@unpack_inputs
1063
1194
@add_start_docstrings_to_model_forward (ALBERT_INPUTS_DOCSTRING .format ("batch_size, sequence_length" ))
@@ -1117,6 +1248,17 @@ def call(
1117
1248
attentions = outputs .attentions ,
1118
1249
)
1119
1250
1251
def build(self, input_shape=None):
    """Build ``albert`` and then the ``classifier`` head, each under its own name scope.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    A no-op once ``self.built`` is set. Attributes that are missing or ``None``
    are skipped.
    """
    if self.built:
        return
    self.built = True

    backbone = getattr(self, "albert", None)
    if backbone is not None:
        with tf.name_scope(backbone.name):
            backbone.build(None)

    head = getattr(self, "classifier", None)
    if head is not None:
        with tf.name_scope(head.name):
            # NOTE(review): a bare int is passed as the input shape — confirm intended.
            head.build(self.config.hidden_size)
1261
+
1120
1262
1121
1263
@add_start_docstrings (
1122
1264
"""
@@ -1145,6 +1287,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
1145
1287
self .classifier = tf .keras .layers .Dense (
1146
1288
units = config .num_labels , kernel_initializer = get_initializer (config .initializer_range ), name = "classifier"
1147
1289
)
1290
+ self .config = config
1148
1291
1149
1292
@unpack_inputs
1150
1293
@add_start_docstrings_to_model_forward (ALBERT_INPUTS_DOCSTRING .format ("batch_size, sequence_length" ))
@@ -1200,6 +1343,17 @@ def call(
1200
1343
attentions = outputs .attentions ,
1201
1344
)
1202
1345
1346
def build(self, input_shape=None):
    """Build ``albert`` (``None`` shape) and ``classifier`` (``config.hidden_size``).

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    Does nothing when ``self.built`` is already set. Each existing sub-layer is
    built inside ``tf.name_scope(<sub-layer name>)``; absent ones are skipped.
    """
    if not self.built:
        self.built = True

        encoder = getattr(self, "albert", None)
        if encoder is not None:
            with tf.name_scope(encoder.name):
                encoder.build(None)

        token_head = getattr(self, "classifier", None)
        if token_head is not None:
            with tf.name_scope(token_head.name):
                # NOTE(review): a bare int is passed as the input shape — confirm intended.
                token_head.build(self.config.hidden_size)
1356
+
1203
1357
1204
1358
@add_start_docstrings (
1205
1359
"""
@@ -1221,6 +1375,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
1221
1375
self .qa_outputs = tf .keras .layers .Dense (
1222
1376
units = config .num_labels , kernel_initializer = get_initializer (config .initializer_range ), name = "qa_outputs"
1223
1377
)
1378
+ self .config = config
1224
1379
1225
1380
@unpack_inputs
1226
1381
@add_start_docstrings_to_model_forward (ALBERT_INPUTS_DOCSTRING .format ("batch_size, sequence_length" ))
@@ -1295,6 +1450,17 @@ def call(
1295
1450
attentions = outputs .attentions ,
1296
1451
)
1297
1452
1453
def build(self, input_shape=None):
    """Build ``albert`` (``None`` shape) and ``qa_outputs`` (``config.hidden_size``).

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    Returns immediately when ``self.built`` is already set. Attributes that are
    missing or ``None`` are skipped.
    """
    if self.built:
        return
    self.built = True

    backbone = getattr(self, "albert", None)
    if backbone is not None:
        with tf.name_scope(backbone.name):
            backbone.build(None)

    span_head = getattr(self, "qa_outputs", None)
    if span_head is not None:
        with tf.name_scope(span_head.name):
            # NOTE(review): a bare int is passed as the input shape — confirm intended.
            span_head.build(self.config.hidden_size)
1463
+
1298
1464
1299
1465
@add_start_docstrings (
1300
1466
"""
@@ -1316,6 +1482,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
1316
1482
self .classifier = tf .keras .layers .Dense (
1317
1483
units = 1 , kernel_initializer = get_initializer (config .initializer_range ), name = "classifier"
1318
1484
)
1485
+ self .config = config
1319
1486
1320
1487
@unpack_inputs
1321
1488
@add_start_docstrings_to_model_forward (ALBERT_INPUTS_DOCSTRING .format ("batch_size, num_choices, sequence_length" ))
@@ -1394,3 +1561,14 @@ def call(
1394
1561
hidden_states = outputs .hidden_states ,
1395
1562
attentions = outputs .attentions ,
1396
1563
)
1564
+
1565
def build(self, input_shape=None):
    """Build ``albert`` and then ``classifier``, each inside its own name scope.

    Args:
        input_shape: Accepted for Keras API compatibility; not read by this method.

    A no-op once ``self.built`` is set. Attributes that are missing or ``None``
    are skipped.
    """
    if not self.built:
        self.built = True

        base_model = getattr(self, "albert", None)
        if base_model is not None:
            with tf.name_scope(base_model.name):
                base_model.build(None)

        choice_head = getattr(self, "classifier", None)
        if choice_head is not None:
            with tf.name_scope(choice_head.name):
                # NOTE(review): a bare int is passed as the input shape — confirm intended.
                choice_head.build(self.config.hidden_size)
0 commit comments