@@ -146,7 +146,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

-    def build(self, input_shape: tf.TensorShape):
+    def build(self, input_shape=None):
        with tf.name_scope("word_embeddings"):
            self.weight = self.add_weight(
                name="weight",
@@ -168,7 +168,12 @@ def build(self, input_shape: tf.TensorShape):
                initializer=get_initializer(self.initializer_range),
            )

-        super().build(input_shape)
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.embedding_size])

    # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
    def call(
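The two hunks above are the template for everything that follows: the old `build` relied on Keras inferring shapes from the first real input and calling `super().build()`, while the new one is idempotent (the `self.built` guard), accepts `input_shape=None`, and builds each sublayer explicitly under `tf.name_scope(sublayer.name)` so variable names, and therefore checkpoint keys, match what a lazy first call would have produced. A minimal standalone sketch of the pattern, using a hypothetical `TwoDenseLayer` rather than anything from this file:

import tensorflow as tf

class TwoDenseLayer(tf.keras.layers.Layer):
    # Hypothetical layer, for illustration only.
    def __init__(self, hidden_size: int, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.dense_a = tf.keras.layers.Dense(hidden_size, name="dense_a")
        self.dense_b = tf.keras.layers.Dense(hidden_size, name="dense_b")

    def call(self, x: tf.Tensor) -> tf.Tensor:
        return self.dense_b(self.dense_a(x))

    def build(self, input_shape=None):
        # Idempotent, and callable with no input shape: the layer already
        # knows every dimension it needs from __init__.
        if self.built:
            return
        self.built = True
        # Building inside each sublayer's name scope reproduces the nested
        # variable names a first forward pass would have created, so
        # checkpoints saved against lazily-built models still load.
        for sublayer in (self.dense_a, self.dense_b):
            with tf.name_scope(sublayer.name):
                sublayer.build([None, None, self.hidden_size])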
@@ -246,6 +251,8 @@ def __init__(self, config: AlbertConfig, **kwargs):
        # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
        self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
        self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.hidden_size = config.hidden_size
+        self.config = config

    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -307,6 +314,26 @@ def call(

        return outputs

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):
+                self.query.build(self.config.hidden_size)
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build(self.config.hidden_size)
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build(self.config.hidden_size)
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.hidden_size)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+

class TFAlbertLayer(tf.keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
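A note on the shapes passed above (my reading of Keras, not something the diff states): `Dense.build` only inspects the last dimension of whatever shape it receives, so the bare integer `self.config.hidden_size` behaves the same as `[None, None, self.config.hidden_size]`. `LayerNormalization`, by contrast, pins the exact input rank in its `InputSpec` at build time, which is why it is handed the full rank-3 shape. A quick check, assuming TF 2.x:

import tensorflow as tf

dense = tf.keras.layers.Dense(8)
dense.build(4)  # tf.TensorShape(4) -> (4,); only the last dim matters to Dense
print(dense.kernel.shape)  # (4, 8)

# LayerNormalization records the full rank, so the diff gives it [None, None, size]:
norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)
norm.build([None, None, 4])
print(norm.beta.shape)  # (4,)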
@@ -329,6 +356,9 @@ def __init__(self, config: AlbertConfig, **kwargs):
            epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
        )
        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.intermediate_size = config.intermediate_size
+        self.hidden_size = config.hidden_size
+        self.config = config

    def call(
        self,
@@ -356,6 +386,23 @@ def call(

        return outputs

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "ffn", None) is not None:
+            with tf.name_scope(self.ffn.name):
+                self.ffn.build(self.config.hidden_size)
+        if getattr(self, "ffn_output", None) is not None:
+            with tf.name_scope(self.ffn_output.name):
+                self.ffn_output.build(self.config.intermediate_size)
+        if getattr(self, "full_layer_layer_norm", None) is not None:
+            with tf.name_scope(self.full_layer_layer_norm.name):
+                self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
+

class TFAlbertLayerGroup(tf.keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
@@ -399,6 +446,15 @@ def call(

        return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None)

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert_layers", None) is not None:
+            for layer in self.albert_layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+

class TFAlbertTransformer(tf.keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
@@ -416,6 +472,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
        self.albert_layer_groups = [
            TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups)
        ]
+        self.config = config

    def call(
        self,
@@ -457,6 +514,18 @@ def call(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embedding_hidden_mapping_in", None) is not None:
+            with tf.name_scope(self.embedding_hidden_mapping_in.name):
+                self.embedding_hidden_mapping_in.build(self.config.embedding_size)
+        if getattr(self, "albert_layer_groups", None) is not None:
+            for layer in self.albert_layer_groups:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+

class TFAlbertPreTrainedModel(TFPreTrainedModel):
    """
@@ -487,14 +556,23 @@ def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = input_embeddings
+        self.hidden_size = config.hidden_size

-    def build(self, input_shape: tf.TensorShape):
+    def build(self, input_shape=None):
        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
        self.decoder_bias = self.add_weight(
            shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
        )

-        super().build(input_shape)
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.hidden_size)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.embedding_size])

    def get_output_embeddings(self) -> tf.keras.layers.Layer:
        return self.decoder
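One asymmetry in the MLM-head hunk above (an observation, not something the diff calls out): the two `add_weight` calls sit above the `if self.built: return` guard, so a second call to `build()` would still re-create `bias` and `decoder_bias`. In practice `build()` runs once, but a guard-first variant would make this method idempotent like the others; a sketch:

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        self.bias = self.add_weight(
            shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias"
        )
        self.decoder_bias = self.add_weight(
            shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
        )
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build(self.config.hidden_size)
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.embedding_size])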
@@ -650,6 +728,20 @@ def call(
            attentions=encoder_outputs.attentions,
        )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "pooler", None) is not None:
+            with tf.name_scope(self.pooler.name):
+                self.pooler.build(None)  # TODO Matt might be wrong
+

@dataclass
class TFAlbertForPreTrainingOutput(ModelOutput):
@@ -825,6 +917,14 @@ def call(

        return outputs

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+

@add_start_docstrings(
    """
@@ -921,6 +1021,20 @@ def call(
            attentions=outputs.attentions,
        )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "predictions", None) is not None:
+            with tf.name_scope(self.predictions.name):
+                self.predictions.build(None)
+        if getattr(self, "sop_classifier", None) is not None:
+            with tf.name_scope(self.sop_classifier.name):
+                self.sop_classifier.build(None)
+

class TFAlbertSOPHead(tf.keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
@@ -932,13 +1046,23 @@ def __init__(self, config: AlbertConfig, **kwargs):
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
+        self.hidden_size = config.hidden_size
+        self.config = config

    def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor:
        dropout_pooled_output = self.dropout(inputs=pooled_output, training=training)
        logits = self.classifier(inputs=dropout_pooled_output)

        return logits

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
+

@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
@@ -1035,6 +1159,17 @@ def call(
            attentions=outputs.attentions,
        )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "predictions", None) is not None:
+            with tf.name_scope(self.predictions.name):
+                self.predictions.build(None)
+

@add_start_docstrings(
    """
@@ -1058,6 +1193,8 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
        self.classifier = tf.keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
+        self.hidden_size = config.hidden_size
+        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1117,6 +1254,17 @@ def call(
            attentions=outputs.attentions,
        )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
+

@add_start_docstrings(
    """
@@ -1145,6 +1293,8 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
        self.classifier = tf.keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
+        self.hidden_size = config.hidden_size
+        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1200,6 +1350,17 @@ def call(
            attentions=outputs.attentions,
        )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
+

@add_start_docstrings(
    """
@@ -1221,6 +1382,8 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
        self.qa_outputs = tf.keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
+        self.hidden_size = config.hidden_size
+        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1295,6 +1458,17 @@ def call(
            attentions=outputs.attentions,
        )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build(self.config.hidden_size)
+

@add_start_docstrings(
    """
@@ -1316,6 +1490,8 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
        self.classifier = tf.keras.layers.Dense(
            units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
+        self.hidden_size = config.hidden_size
+        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@@ -1394,3 +1570,14 @@ def call(
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
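Taken together, these overrides let a whole model materialize its weights top-down without a dummy forward pass, which is the point of the change: each `build(None)` call recurses into its children, with every layer supplying its own shapes from the config. A usage sketch, assuming a transformers version that includes this diff:

from transformers import AlbertConfig, TFAlbertForSequenceClassification

config = AlbertConfig(
    embedding_size=32, hidden_size=64, intermediate_size=128,
    num_attention_heads=4, num_hidden_layers=2,
)
model = TFAlbertForSequenceClassification(config)
model.build(None)  # every variable now exists; no dummy batch was run
print(len(model.trainable_weights))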